From eba2c5ca45e30ee9937ef212231a83b61df94bf2 Mon Sep 17 00:00:00 2001
From: himeditator
Date: Sat, 6 Sep 2025 20:49:46 +0800
Subject: [PATCH] feat(engine): refactor the caption engine and add the
 Sherpa-ONNX SenseVoice speech recognition model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Refactor the caption engine: audio capture now runs on its own thread
- Refactor the classes in audio2text and adjust their control flow
- Update the main function to add support for the Sosv model
- Change the AudioStream classes to default to a 16000 Hz sample rate
---
 engine/audio2text/__init__.py   |   4 +-
 engine/audio2text/gummy.py      |  25 ++++-
 engine/audio2text/sosv.py       | 139 ++++++++++++++++++++++
 engine/audio2text/vosk.py       |   8 ++
 engine/main.py                  | 168 ++++++++++++++++++++++----------
 engine/sysaudio/darwin.py       |  40 +++++---
 engine/sysaudio/linux.py        |  11 +--
 engine/sysaudio/win.py          |  38 +++++---
 engine/utils/__init__.py        |   2 +-
 engine/utils/audioprcs.py       |  25 +++++
 engine/utils/server.py          |  12 +--
 engine/utils/shared.py          |   8 ++
 engine/utils/thdata.py          |   5 -
 src/main/utils/CaptionEngine.ts |   4 +-
 14 files changed, 377 insertions(+), 112 deletions(-)
 create mode 100644 engine/audio2text/sosv.py
 create mode 100644 engine/utils/shared.py
 delete mode 100644 engine/utils/thdata.py
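Notes (not part of the commit message): a minimal invocation of the new
engine, assuming a local SenseVoice model directory that contains
model.onnx, tokens.txt and silero_vad.onnx (the path below is a
placeholder):

    python engine/main.py -e sosv -sosv /path/to/sensevoice-model -t none -a 0 -c 10 -p 0

Here -a 0 captures system output audio, -c 10 queues ten chunks per
second, and -p 0 skips the control server so the engine starts
immediately.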
diff --git a/engine/audio2text/__init__.py b/engine/audio2text/__init__.py
index 988d76e..a4e7f15 100644
--- a/engine/audio2text/__init__.py
+++ b/engine/audio2text/__init__.py
@@ -1,3 +1,3 @@
-from dashscope.common.error import InvalidParameter
 from .gummy import GummyRecognizer
-from .vosk import VoskRecognizer
\ No newline at end of file
+from .vosk import VoskRecognizer
+from .sosv import SosvRecognizer
\ No newline at end of file
diff --git a/engine/audio2text/gummy.py b/engine/audio2text/gummy.py
index 7d95b9b..2f31a5c 100644
--- a/engine/audio2text/gummy.py
+++ b/engine/audio2text/gummy.py
@@ -5,9 +5,10 @@ from dashscope.audio.asr import (
     TranslationRecognizerRealtime
 )
 import dashscope
+from dashscope.common.error import InvalidParameter
 from datetime import datetime
-from utils import stdout_cmd, stdout_obj, stderr
-
+from utils import stdout_cmd, stdout_obj, stdout_err
+from utils import shared_data
 
 class Callback(TranslationRecognizerCallback):
     """
@@ -90,9 +91,23 @@ class GummyRecognizer:
         """Start the Gummy engine"""
         self.translator.start()
 
-    def send_audio_frame(self, data):
-        """Send an audio frame; the engine recognizes it automatically and writes the result to stdout"""
-        self.translator.send_audio_frame(data)
+    def translate(self):
+        """Keep reading audio frames from the shared data, recognize them, and write the results to stdout"""
+        global shared_data
+        restart_count = 0
+        while shared_data.status == 'running':
+            chunk = shared_data.chunk_queue.get()
+            try:
+                self.translator.send_audio_frame(chunk)
+            except InvalidParameter as e:
+                restart_count += 1
+                if restart_count > 5:
+                    stdout_err(str(e))
+                    shared_data.status = "kill"
+                    stdout_cmd('kill')
+                    break
+                else:
+                    stdout_cmd('info', f'Gummy engine stopped, restart attempt: {restart_count}...')
 
     def stop(self):
         """Stop the Gummy engine"""
diff --git a/engine/audio2text/sosv.py b/engine/audio2text/sosv.py
new file mode 100644
index 0000000..4c59ce4
--- /dev/null
+++ b/engine/audio2text/sosv.py
@@ -0,0 +1,139 @@
+"""
+Sherpa-ONNX SenseVoice Model
+
+This file references the following example:
+
+https://github.com/k2-fsa/sherpa-onnx/blob/master/python-api-examples/simulate-streaming-sense-voice-microphone.py
+"""
+
+import time
+from datetime import datetime
+import sherpa_onnx
+import numpy as np
+
+from utils import shared_data
+from utils import stdout_cmd, stdout_obj
+from utils import google_translate, ollama_translate
+
+
+class SosvRecognizer:
+    """
+    Process streaming audio with the non-streaming SenseVoice model and write JSON strings readable by the Auto Caption app to stdout
+
+    Init parameters:
+        model_path: path to the Sherpa-ONNX SenseVoice model directory
+            (model.onnx, tokens.txt and silero_vad.onnx are expected inside it)
+        target: target language for translation
+        trans_model: translation model name
+        ollama_name: Ollama model name
+    """
+    def __init__(self, model_path: str, target: str | None, trans_model: str, ollama_name: str):
+        if model_path.startswith('"'):
+            model_path = model_path[1:]
+        if model_path.endswith('"'):
+            model_path = model_path[:-1]
+        self.model_path = model_path
+        self.target = target
+        if trans_model == 'google':
+            self.trans_func = google_translate
+        else:
+            self.trans_func = ollama_translate
+        self.ollama_name = ollama_name
+
+        self.time_str = ''
+        self.cur_id = 0
+        self.prev_content = ''
+
+    def start(self):
+        """Start the SenseVoice model"""
+        self.recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
+            model=f"{self.model_path}/model.onnx",
+            tokens=f"{self.model_path}/tokens.txt",
+            num_threads=2,
+        )
+        config = sherpa_onnx.VadModelConfig()
+        config.silero_vad.model = f"{self.model_path}/silero_vad.onnx"
+        config.silero_vad.threshold = 0.5
+        config.silero_vad.min_silence_duration = 0.1
+        config.silero_vad.min_speech_duration = 0.25
+        config.silero_vad.max_speech_duration = 8
+        config.sample_rate = 16000
+        self.window_size = config.silero_vad.window_size
+        self.vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)
+        self.buffer = []
+        self.offset = 0
+        self.started = False
+        self.started_time = 0.0
+        self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+        stdout_cmd('info', 'Sherpa-ONNX SenseVoice recognizer started.')
+
+    def send_audio_frame(self, data: bytes):
+        """
+        Send an audio frame to the SOSV engine; it is recognized automatically and the result is written to stdout
+
+        Args:
+            data: audio frame data; the sample rate must be 16000 Hz
+        """
+        caption = {}
+        caption['command'] = 'caption'
+        caption['translation'] = ''
+
+        data_np = np.frombuffer(data, dtype=np.int16).astype(np.float32)
+        self.buffer = np.concatenate([self.buffer, data_np])
+        while self.offset + self.window_size < len(self.buffer):
+            self.vad.accept_waveform(self.buffer[self.offset: self.offset + self.window_size])
+            if not self.started and self.vad.is_speech_detected():
+                self.started = True
+                self.started_time = time.time()
+            self.offset += self.window_size
+
+        if not self.started:
+            if len(self.buffer) > 10 * self.window_size:
+                self.offset -= len(self.buffer) - 10 * self.window_size
+                self.buffer = self.buffer[-10 * self.window_size:]
+
+        if self.started and time.time() - self.started_time > 0.2:
+            stream = self.recognizer.create_stream()
+            stream.accept_waveform(16000, self.buffer)
+            self.recognizer.decode_stream(stream)
+            text = stream.result.text.strip()
+            if text and self.prev_content != text:
+                caption['index'] = self.cur_id
+                caption['text'] = text
+                caption['time_s'] = self.time_str
+                caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+                self.prev_content = text
+                stdout_obj(caption)
+            self.started_time = time.time()
+
+        while not self.vad.empty():
+            stream = self.recognizer.create_stream()
+            stream.accept_waveform(16000, self.vad.front.samples)
+            self.vad.pop()
+            self.recognizer.decode_stream(stream)
+            text = stream.result.text.strip()
+
+            caption['index'] = self.cur_id
+            caption['text'] = text
+            caption['time_s'] = self.time_str
+            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+            self.prev_content = ''
+            stdout_obj(caption)
+
+            self.cur_id += 1
+            self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+            self.buffer = []
+            self.offset = 0
+            self.started = False
+            self.started_time = 0.0
+
+    def translate(self):
+        """Keep reading audio frames from the shared data, recognize them, and write the results to stdout"""
+        global shared_data
+        while shared_data.status == 'running':
+            chunk = shared_data.chunk_queue.get()
+            self.send_audio_frame(chunk)
+
+    def stop(self):
+        """Stop the SenseVoice model"""
+        stdout_cmd('info', 'Sherpa-ONNX SenseVoice recognizer closed.')
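A quick sanity check of the windowing above, assuming Silero's usual
512-sample window at 16 kHz (the actual value is read from
config.silero_vad.window_size): with the default chunk_rate of 10,
every queued chunk holds 1600 samples, so each send_audio_frame call
feeds three full windows to the VAD and carries the 64-sample
remainder in self.buffer, tracked by self.offset:

    # Hypothetical figures; window_size really comes from VadModelConfig.
    RATE, CHUNK_RATE, WINDOW = 16000, 10, 512
    chunk_len = RATE // CHUNK_RATE                 # 1600 samples per queued chunk
    full_windows = chunk_len // WINDOW             # 3 windows handed to the VAD
    remainder = chunk_len - full_windows * WINDOW  # 64 samples carried over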
diff --git a/engine/audio2text/vosk.py b/engine/audio2text/vosk.py
index 850a85d..600f866 100644
--- a/engine/audio2text/vosk.py
+++ b/engine/audio2text/vosk.py
@@ -4,6 +4,7 @@ import time
 from datetime import datetime
 from vosk import Model, KaldiRecognizer, SetLogLevel
 
+from utils import shared_data
 from utils import stdout_cmd, stdout_obj, google_translate, ollama_translate
 
 
@@ -82,6 +83,13 @@
 
         stdout_obj(caption)
 
+    def translate(self):
+        """Keep reading audio frames from the shared data, recognize them, and write the results to stdout"""
+        global shared_data
+        while shared_data.status == 'running':
+            chunk = shared_data.chunk_queue.get()
+            self.send_audio_frame(chunk)
+
     def stop(self):
         """Stop the Vosk engine"""
         stdout_cmd('info', 'Vosk recognizer closed.')
\ No newline at end of file
diff --git a/engine/main.py b/engine/main.py
index dd01e7c..d1d6177 100644
--- a/engine/main.py
+++ b/engine/main.py
@@ -1,70 +1,120 @@
+import wave
 import argparse
-from utils import stdout_cmd, stdout_err
-from utils import thread_data, start_server
+import threading
+from utils import stdout, stdout_cmd
+from utils import shared_data, start_server
 from utils import merge_chunk_channels, resample_chunk_mono
-from audio2text import InvalidParameter, GummyRecognizer
+from audio2text import GummyRecognizer
 from audio2text import VoskRecognizer
+from audio2text import SosvRecognizer
 from sysaudio import AudioStream
 
 
+def audio_recording(stream: AudioStream, resample: bool, save=False):
+    global shared_data
+    stream.open_stream()
+    if save:
+        wf = wave.open('record.wav', 'wb')
+        wf.setnchannels(1)
+        wf.setsampwidth(stream.SAMP_WIDTH)
+        wf.setframerate(16000)
+    while shared_data.status == 'running':
+        raw_chunk = stream.read_chunk()
+        if raw_chunk is None: continue
+        if resample:
+            chunk = resample_chunk_mono(raw_chunk, stream.CHANNELS, stream.RATE, 16000)
+        else:
+            chunk = merge_chunk_channels(raw_chunk, stream.CHANNELS)
+        shared_data.chunk_queue.put(chunk)
+        if save: wf.writeframes(chunk) # type: ignore
+    if save: wf.close() # type: ignore
+    stream.close_stream_signal()
+
+
 def main_gummy(s: str, t: str, a: int, c: int, k: str):
-    global thread_data
+    """
+    Parameters:
+        s: Source language; t: Target language
+        a: Audio source: 0 for output, 1 for input
+        c: Number of chunks per second; k: Aliyun Bailian API key
+    """
     stream = AudioStream(a, c)
     if t == 'none':
         engine = GummyRecognizer(stream.RATE, s, None, k)
     else:
         engine = GummyRecognizer(stream.RATE, s, t, k)
 
-    stream.open_stream()
     engine.start()
-    chunk_mono = bytes()
-
-    restart_count = 0
-    while thread_data.status == "running":
-        try:
-            chunk = stream.read_chunk()
-            if chunk is None: continue
-            chunk_mono = merge_chunk_channels(chunk, stream.CHANNELS)
-            try:
-                engine.send_audio_frame(chunk_mono)
-            except InvalidParameter as e:
-                restart_count += 1
-                if restart_count > 5:
-                    stdout_err(str(e))
-                    thread_data.status = "kill"
-                    stdout_cmd('kill')
-                    break
-                else:
-                    stdout_cmd('info', f'Gummy engine stopped, restart attempt: {restart_count}...')
-        except KeyboardInterrupt:
-            break
-
-    engine.send_audio_frame(chunk_mono)
-    stream.close_stream()
+    stream_thread = threading.Thread(
+        target=audio_recording,
+        args=(stream, False),
+        daemon=True
+    )
+    stream_thread.start()
+    try:
+        engine.translate()
+    except KeyboardInterrupt:
+        stdout("Keyboard interrupt detected. Exiting...")
 
     engine.stop()
 
 
-def main_vosk(a: int, c: int, m: str, t: str, tm: str, on: str):
-    global thread_data
+def main_vosk(a: int, c: int, vosk: str, t: str, tm: str, omn: str):
+    """
+    Parameters:
+        a: Audio source: 0 for output, 1 for input
+        c: Number of chunks collected per second
+        vosk: Vosk model path
+        t: Target language
+        tm: Translation model type, ollama or google
+        omn: Ollama model name
+    """
     stream = AudioStream(a, c)
-    engine = VoskRecognizer(
-        m, None if t == 'none' else t,
-        tm, on
-    )
+    if t == 'none':
+        engine = VoskRecognizer(vosk, None, tm, omn)
+    else:
+        engine = VoskRecognizer(vosk, t, tm, omn)
 
-    stream.open_stream()
     engine.start()
+    stream_thread = threading.Thread(
+        target=audio_recording,
+        args=(stream, True),
+        daemon=True
+    )
+    stream_thread.start()
+    try:
+        engine.translate()
+    except KeyboardInterrupt:
+        stdout("Keyboard interrupt detected. Exiting...")
+    engine.stop()
 
-    while thread_data.status == "running":
-        try:
-            chunk = stream.read_chunk()
-            if chunk is None: continue
-            chunk_mono = resample_chunk_mono(chunk, stream.CHANNELS, stream.RATE, 16000)
-            engine.send_audio_frame(chunk_mono)
-        except KeyboardInterrupt:
-            break
 
-    stream.close_stream()
+
+def main_sosv(a: int, c: int, sosv: str, t: str, tm: str, omn: str):
+    """
+    Parameters:
+        a: Audio source: 0 for output, 1 for input
+        c: Number of chunks collected per second
+        sosv: Sherpa-ONNX SenseVoice model path
+        t: Target language
+        tm: Translation model type, ollama or google
+        omn: Ollama model name
+    """
+    stream = AudioStream(a, c)
+    if t == 'none':
+        engine = SosvRecognizer(sosv, None, tm, omn)
+    else:
+        engine = SosvRecognizer(sosv, t, tm, omn)
+
+    engine.start()
+    stream_thread = threading.Thread(
+        target=audio_recording,
+        args=(stream, True),
+        daemon=True
+    )
+    stream_thread.start()
+    try:
+        engine.translate()
+    except KeyboardInterrupt:
+        stdout("Keyboard interrupt detected. Exiting...")
 
     engine.stop()
@@ -74,22 +124,25 @@
-    parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy or vosk')
+    parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy, vosk or sosv')
     parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
     parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second')
-    parser.add_argument('-p', '--port', default=8080, help='The port to run the server on, 0 for no server')
+    parser.add_argument('-p', '--port', default=0, help='The port to run the server on, 0 for no server')
     parser.add_argument('-t', '--target_language', default='zh', help='Target language code, "none" for no translation')
     # gummy only
     parser.add_argument('-s', '--source_language', default='en', help='Source language code')
     parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
+    # vosk and sosv
+    parser.add_argument('-tm', '--translation_model', default='ollama', help='Model for translation: ollama or google')
+    parser.add_argument('-omn', '--ollama_name', default='', help='Ollama model name for translation')
     # vosk only
-    parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
-    parser.add_argument('-tm', '--translation_model', default='', help='Google translate API KEY')
-    parser.add_argument('-on', '--ollama_name', default='', help='Ollama model name for translation')
+    parser.add_argument('-vosk', '--vosk_model', default='', help='The path to the vosk model.')
+    # sosv only
+    parser.add_argument('-sosv', '--sosv_model', default=None, help='The SenseVoice model path')
 
     args = parser.parse_args()
 
     if int(args.port) == 0:
-        thread_data.status = "running"
+        shared_data.status = "running"
     else:
         start_server(int(args.port))
-    
+
     if args.caption_engine == 'gummy':
         main_gummy(
             args.source_language,
@@ -102,7 +155,16 @@
         main_vosk(
             int(args.audio_type),
             int(args.chunk_rate),
-            args.model_path,
+            args.vosk_model,
+            args.target_language,
+            args.translation_model,
+            args.ollama_name
+        )
+    elif args.caption_engine == 'sosv':
+        main_sosv(
+            int(args.audio_type),
+            int(args.chunk_rate),
+            args.sosv_model,
             args.target_language,
             args.translation_model,
             args.ollama_name
@@ -110,5 +172,5 @@
     else:
         raise ValueError('Invalid caption engine specified.')
 
-    if thread_data.status == "kill":
+    if shared_data.status == "kill":
         stdout_cmd('kill')
diff --git a/engine/sysaudio/darwin.py b/engine/sysaudio/darwin.py
index 6f32487..c642732 100644
--- a/engine/sysaudio/darwin.py
+++ b/engine/sysaudio/darwin.py
@@ -37,14 +37,13 @@ class AudioStream:
         self.FORMAT = pyaudio.paInt16
         self.SAMP_WIDTH = pyaudio.get_sample_size(self.FORMAT)
         self.CHANNELS = int(self.device["maxInputChannels"])
-        self.RATE = int(self.device["defaultSampleRate"])
-        self.CHUNK = self.RATE // chunk_rate
+        self.DEFAULT_RATE = int(self.device["defaultSampleRate"])
+        self.CHUNK_RATE = chunk_rate
 
-    def reset_chunk_size(self, chunk_size: int):
-        """
-        Reset the audio chunk size
-        """
-        self.CHUNK = chunk_size
+        self.RATE = 16000
+        self.CHUNK = self.RATE // self.CHUNK_RATE
+        self.open_stream()
+        self.close_stream()
 
     def get_info(self):
         dev_info = f"""
@@ -72,16 +71,27 @@ class AudioStream:
         Open and return the system audio output stream
         """
         if self.stream: return self.stream
-        self.stream = self.mic.open(
-            format = self.FORMAT,
-            channels = int(self.CHANNELS),
-            rate = self.RATE,
-            input = True,
-            input_device_index = int(self.INDEX)
-        )
+        try:
+            self.stream = self.mic.open(
+                format = self.FORMAT,
+                channels = int(self.CHANNELS),
+                rate = self.RATE,
+                input = True,
+                input_device_index = int(self.INDEX)
+            )
+        except OSError:
+            self.RATE = self.DEFAULT_RATE
+            self.CHUNK = self.RATE // self.CHUNK_RATE
+            self.stream = self.mic.open(
+                format = self.FORMAT,
+                channels = int(self.CHANNELS),
+                rate = self.RATE,
+                input = True,
+                input_device_index = int(self.INDEX)
+            )
         return self.stream
 
-    def read_chunk(self):
+    def read_chunk(self) -> bytes | None:
         """
         Read audio data
         """
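This PyAudio back end (and its Windows twin below) now probes for
16 kHz capture up front: the constructor opens and immediately closes
a stream, and open_stream falls back to the device's default rate on
OSError, leaving the conversion to 16 kHz to the capture thread's
resampler. A condensed sketch of that negotiation (hypothetical
helper, not code from this patch):

    # Hypothetical sketch of the probe-and-fallback logic above.
    def negotiate_rate(open_stream, preferred=16000, default_rate=48000):
        try:
            open_stream(preferred)     # device accepts 16 kHz: no resampling needed
            return preferred
        except OSError:
            open_stream(default_rate)  # device refused: capture at its native rate
            return default_rate        # caller must resample chunks to 16 kHz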
diff --git a/engine/sysaudio/linux.py b/engine/sysaudio/linux.py
index 4599674..4a19062 100644
--- a/engine/sysaudio/linux.py
+++ b/engine/sysaudio/linux.py
@@ -55,15 +55,10 @@ class AudioStream:
         self.FORMAT = 16
         self.SAMP_WIDTH = 2
         self.CHANNELS = 2
-        self.RATE = 48000
+        self.RATE = 16000
+        self.CHUNK_RATE = chunk_rate
         self.CHUNK = self.RATE // chunk_rate
 
-    def reset_chunk_size(self, chunk_size: int):
-        """
-        Reset the audio chunk size
-        """
-        self.CHUNK = chunk_size
-
     def get_info(self):
         dev_info = f"""
         Audio capture process:
@@ -84,7 +79,7 @@ class AudioStream:
         Start the audio capture process
         """
         self.process = subprocess.Popen(
-            ["parec", "-d", self.source, "--format=s16le", "--rate=48000", "--channels=2"],
+            ["parec", "-d", self.source, "--format=s16le", "--rate=16000", "--channels=2"],
             stdout=subprocess.PIPE
         )
diff --git a/engine/sysaudio/win.py b/engine/sysaudio/win.py
index fcf5b49..4d1e3a3 100644
--- a/engine/sysaudio/win.py
+++ b/engine/sysaudio/win.py
@@ -61,14 +61,13 @@ class AudioStream:
         self.FORMAT = pyaudio.paInt16
         self.SAMP_WIDTH = pyaudio.get_sample_size(self.FORMAT)
         self.CHANNELS = int(self.device["maxInputChannels"])
-        self.RATE = int(self.device["defaultSampleRate"])
-        self.CHUNK = self.RATE // chunk_rate
+        self.DEFAULT_RATE = int(self.device["defaultSampleRate"])
+        self.CHUNK_RATE = chunk_rate
 
-    def reset_chunk_size(self, chunk_size: int):
-        """
-        Reset the audio chunk size
-        """
-        self.CHUNK = chunk_size
+        self.RATE = 16000
+        self.CHUNK = self.RATE // self.CHUNK_RATE
+        self.open_stream()
+        self.close_stream()
 
     def get_info(self):
         dev_info = f"""
@@ -96,13 +95,24 @@ class AudioStream:
         Open and return the system audio output stream
         """
         if self.stream: return self.stream
-        self.stream = self.mic.open(
-            format = self.FORMAT,
-            channels = self.CHANNELS,
-            rate = self.RATE,
-            input = True,
-            input_device_index = self.INDEX
-        )
+        try:
+            self.stream = self.mic.open(
+                format = self.FORMAT,
+                channels = self.CHANNELS,
+                rate = self.RATE,
+                input = True,
+                input_device_index = self.INDEX
+            )
+        except OSError:
+            self.RATE = self.DEFAULT_RATE
+            self.CHUNK = self.RATE // self.CHUNK_RATE
+            self.stream = self.mic.open(
+                format = self.FORMAT,
+                channels = self.CHANNELS,
+                rate = self.RATE,
+                input = True,
+                input_device_index = self.INDEX
+            )
         return self.stream
 
     def read_chunk(self) -> bytes | None:
diff --git a/engine/utils/__init__.py b/engine/utils/__init__.py
index 0e07ecf..6950ff2 100644
--- a/engine/utils/__init__.py
+++ b/engine/utils/__init__.py
@@ -5,6 +5,6 @@ from .audioprcs import (
     resample_mono_chunk
 )
 from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr
-from .thdata import thread_data
+from .shared import shared_data
 from .server import start_server
 from .translation import ollama_translate, google_translate
\ No newline at end of file
diff --git a/engine/utils/audioprcs.py b/engine/utils/audioprcs.py
index d1e664a..5422a30 100644
--- a/engine/utils/audioprcs.py
+++ b/engine/utils/audioprcs.py
@@ -49,9 +49,18 @@ def resample_chunk_mono(chunk: bytes, channels: int, orig_sr: int, target_sr: in
     # (length,)
     chunk_mono = np.mean(chunk_np.astype(np.float32), axis=1)
 
+    if orig_sr == target_sr:
+        return chunk_mono.astype(np.int16).tobytes()
+
     ratio = target_sr / orig_sr
     chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
     chunk_mono_r = np.round(chunk_mono_r).astype(np.int16)
+    real_len = round(chunk_mono.shape[0] * ratio)
+    if chunk_mono_r.shape[0] > real_len:
+        chunk_mono_r = chunk_mono_r[:real_len]
+    else:
+        while chunk_mono_r.shape[0] < real_len:
+            chunk_mono_r = np.append(chunk_mono_r, chunk_mono_r[-1])
 
     return chunk_mono_r.tobytes()
 
@@ -81,9 +90,18 @@ def resample_chunk_mono_np(chunk: bytes, channels: int, orig_sr: int, target_sr:
     # (length,)
     chunk_mono = np.mean(chunk_np.astype(np.float32), axis=1)
 
+    if orig_sr == target_sr:
+        return chunk_mono.astype(dtype)
+
     ratio = target_sr / orig_sr
     chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
     chunk_mono_r = chunk_mono_r.astype(dtype)
+    real_len = round(chunk_mono.shape[0] * ratio)
+    if chunk_mono_r.shape[0] > real_len:
+        chunk_mono_r = chunk_mono_r[:real_len]
+    else:
+        while chunk_mono_r.shape[0] < real_len:
+            chunk_mono_r = np.append(chunk_mono_r, chunk_mono_r[-1])
 
     return chunk_mono_r
 
@@ -100,9 +118,16 @@ def resample_mono_chunk(chunk: bytes, orig_sr: int, target_sr: int, mode="sinc_b
     Return:
         Mono audio data chunk
     """
+    if orig_sr == target_sr: return chunk
     chunk_np = np.frombuffer(chunk, dtype=np.int16)
     chunk_np = chunk_np.astype(np.float32)
     ratio = target_sr / orig_sr
     chunk_r = samplerate.resample(chunk_np, ratio, converter_type=mode)
     chunk_r = np.round(chunk_r).astype(np.int16)
+    real_len = round(chunk_np.shape[0] * ratio)
+    if chunk_r.shape[0] > real_len:
+        chunk_r = chunk_r[:real_len]
+    else:
+        while chunk_r.shape[0] < real_len:
+            chunk_r = np.append(chunk_r, chunk_r[-1])
     return chunk_r.tobytes()
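The trim-or-pad tail added to each resampler pins the output length to
the mathematically expected one, since libsamplerate may deliver one
sample more or fewer per call. A worked example, assuming a 48 kHz
device and the default chunk_rate of 10: one chunk is 4800 frames,
ratio = 16000 / 48000 = 1/3, so real_len = round(4800 * 1/3) = 1600
samples; an output of 1601 samples is trimmed to 1600, and 1599 is
padded by repeating the final sample, so every queued chunk stays
exactly 100 ms long.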
diff --git a/engine/utils/server.py b/engine/utils/server.py
index f16d299..c8c3681 100644
--- a/engine/utils/server.py
+++ b/engine/utils/server.py
@@ -1,13 +1,12 @@
 import socket
 import threading
 import json
-# import time
-from utils import thread_data, stdout_cmd, stderr
+from utils import shared_data, stdout_cmd, stderr
 
 
 def handle_client(client_socket):
-    global thread_data
-    while thread_data.status == 'running':
+    global shared_data
+    while shared_data.status == 'running':
         try:
             data = client_socket.recv(4096).decode('utf-8')
             if not data:
@@ -15,13 +14,13 @@
                 break
 
             data = json.loads(data)
             if data['command'] == 'stop':
-                thread_data.status = 'stop'
+                shared_data.status = 'stop'
                 break
 
         except Exception as e:
             stderr(f'Communication error: {e}')
             break
 
-    thread_data.status = 'stop'
+    shared_data.status = 'stop'
     client_socket.close()
@@ -34,7 +33,6 @@ def start_server(port: int):
         stderr(str(e))
         stdout_cmd('kill')
         return
-    # time.sleep(20)
 
     stdout_cmd('connect')
     client, addr = server.accept()
diff --git a/engine/utils/shared.py b/engine/utils/shared.py
new file mode 100644
index 0000000..60c3a69
--- /dev/null
+++ b/engine/utils/shared.py
@@ -0,0 +1,8 @@
+import queue
+
+class SharedData:
+    def __init__(self):
+        self.status = "running"
+        self.chunk_queue = queue.Queue()
+
+shared_data = SharedData()
\ No newline at end of file
diff --git a/engine/utils/thdata.py b/engine/utils/thdata.py
deleted file mode 100644
index 656880f..0000000
--- a/engine/utils/thdata.py
+++ /dev/null
@@ -1,5 +0,0 @@
-class ThreadData:
-    def __init__(self):
-        self.status = "running"
-
-thread_data = ThreadData()
\ No newline at end of file
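SharedData replaces the old ThreadData and is the entire hand-off
contract between the capture thread and the recognizers:
audio_recording() puts mono 16 kHz chunks into chunk_queue, each
engine's translate() blocks on get(), and status doubles as the
shutdown flag. A minimal, self-contained sketch of the same pattern
(illustrative only, not code from this patch):

    import queue
    import threading
    import time

    class SharedData:
        def __init__(self):
            self.status = "running"
            self.chunk_queue = queue.Queue()

    shared = SharedData()

    def producer():
        # Stands in for audio_recording(): 100 ms of 16 kHz mono int16 silence.
        while shared.status == "running":
            shared.chunk_queue.put(b"\x00" * 3200)
            time.sleep(0.1)

    def consumer():
        # Stands in for a recognizer's translate() loop.
        while shared.status == "running":
            chunk = shared.chunk_queue.get()  # blocks until audio arrives
            print(f"got {len(chunk)} bytes")

    threading.Thread(target=producer, daemon=True).start()
    try:
        consumer()
    except KeyboardInterrupt:
        shared.status = "stop"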
diff --git a/src/main/utils/CaptionEngine.ts b/src/main/utils/CaptionEngine.ts
index 516f0ae..39e371a 100644
--- a/src/main/utils/CaptionEngine.ts
+++ b/src/main/utils/CaptionEngine.ts
@@ -81,9 +81,9 @@ export class CaptionEngine {
         }
         else if(allConfig.controls.engine === 'vosk'){
             this.command.push('-e', 'vosk')
-            this.command.push('-m', `"${allConfig.controls.modelPath}"`)
+            this.command.push('-vosk', `"${allConfig.controls.modelPath}"`)
             this.command.push('-tm', allConfig.controls.transModel)
-            this.command.push('-on', allConfig.controls.ollamaName)
+            this.command.push('-omn', allConfig.controls.ollamaName)
         }
     }
 
     Log.info('Engine Path:', this.appPath)