Mirror of https://github.com/HiMeditator/auto-caption.git, synced 2026-02-04 04:14:42 +08:00
feat(engine): refactor the caption engine and add the Sherpa-ONNX SenseVoice speech recognition model

- Refactor the caption engine so audio capture runs on a separate thread
- Refactor the classes in audio2text and adjust their control flow
- Update the main function to add support for the Sosv (Sherpa-ONNX SenseVoice) model
- Change the AudioStream classes to use a 16000 Hz sample rate by default
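For orientation, a minimal sketch of the producer/consumer layout this commit introduces, condensed from the diff below. The names (shared_data, chunk_queue, audio_recording, translate, send_audio_frame) follow the diff; the Recognizer class here is a stand-in for GummyRecognizer / VoskRecognizer / SosvRecognizer, and capture setup and error handling are elided:

import queue

class SharedData:
    def __init__(self):
        self.status = "running"            # "running" | "stop" | "kill"
        self.chunk_queue = queue.Queue()   # audio chunks: capture thread -> recognizer

shared_data = SharedData()

def audio_recording(read_chunk):
    # Producer: runs on a daemon thread and feeds the queue.
    while shared_data.status == "running":
        chunk = read_chunk()
        if chunk is None:
            continue
        shared_data.chunk_queue.put(chunk)

class Recognizer:
    def translate(self):
        # Consumer: the main thread blocks on the queue.
        while shared_data.status == "running":
            self.send_audio_frame(shared_data.chunk_queue.get())

    def send_audio_frame(self, chunk: bytes):
        ...  # engine-specific decoding; results go to stdout

Keeping recognition on the main thread and capture on a daemon thread leaves KeyboardInterrupt handling in one place, and the queue absorbs bursts when decoding is slower than capture.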
engine/audio2text/__init__.py

@@ -1,3 +1,3 @@
-from dashscope.common.error import InvalidParameter
 from .gummy import GummyRecognizer
 from .vosk import VoskRecognizer
+from .sosv import SosvRecognizer
engine/audio2text/gummy.py

@@ -5,9 +5,10 @@ from dashscope.audio.asr import (
     TranslationRecognizerRealtime
 )
 import dashscope
+from dashscope.common.error import InvalidParameter
 from datetime import datetime
-from utils import stdout_cmd, stdout_obj, stderr
+from utils import stdout_cmd, stdout_obj, stdout_err
+from utils import shared_data


 class Callback(TranslationRecognizerCallback):
     """
@@ -90,9 +91,23 @@ class GummyRecognizer:
         """Start the Gummy engine"""
         self.translator.start()

-    def send_audio_frame(self, data):
-        """Send an audio frame; the engine will recognize it and write the result to standard output"""
-        self.translator.send_audio_frame(data)
+    def translate(self):
+        """Continuously read audio frames from the shared data, run recognition, and write the results to standard output"""
+        global shared_data
+        restart_count = 0
+        while shared_data.status == 'running':
+            chunk = shared_data.chunk_queue.get()
+            try:
+                self.translator.send_audio_frame(chunk)
+            except InvalidParameter as e:
+                restart_count += 1
+                if restart_count > 5:
+                    stdout_err(str(e))
+                    shared_data.status = "kill"
+                    stdout_cmd('kill')
+                    break
+                else:
+                    stdout_cmd('info', f'Gummy engine stopped, restart attempt: {restart_count}...')

     def stop(self):
         """Stop the Gummy engine"""
engine/audio2text/sosv.py (new file)

@@ -0,0 +1,139 @@
+"""
+Sherpa-ONNX SenseVoice Model
+
+This code file references the following:
+
+https://github.com/k2-fsa/sherpa-onnx/blob/master/python-api-examples/simulate-streaming-sense-voice-microphone.py
+"""
+
+import time
+from datetime import datetime
+import sherpa_onnx
+import numpy as np
+
+from utils import shared_data
+from utils import stdout_cmd, stdout_obj
+from utils import google_translate, ollama_translate
+
+
+class SosvRecognizer:
+    """
+    Process streaming audio with the non-streaming SenseVoice model and write JSON strings that the Auto Caption app can read to standard output
+
+    Init parameters:
+        model_path: Sherpa-ONNX SenseVoice recognition model path
+        vad_model: Silero VAD model path
+        target: target language for translation
+        trans_model: translation model name
+        ollama_name: Ollama model name
+    """
+    def __init__(self, model_path: str, target: str | None, trans_model: str, ollama_name: str):
+        if model_path.startswith('"'):
+            model_path = model_path[1:]
+        if model_path.endswith('"'):
+            model_path = model_path[:-1]
+        self.model_path = model_path
+        self.target = target
+        if trans_model == 'google':
+            self.trans_func = google_translate
+        else:
+            self.trans_func = ollama_translate
+        self.ollama_name = ollama_name
+
+        self.time_str = ''
+        self.cur_id = 0
+        self.prev_content = ''
+
+    def start(self):
+        """Start the SenseVoice model"""
+        self.recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
+            model=f"{self.model_path}/model.onnx",
+            tokens=f"{self.model_path}/tokens.txt",
+            num_threads = 2,
+        )
+        config = sherpa_onnx.VadModelConfig()
+        config.silero_vad.model = f"{self.model_path}/silero_vad.onnx"
+        config.silero_vad.threshold = 0.5
+        config.silero_vad.min_silence_duration = 0.1
+        config.silero_vad.min_speech_duration = 0.25
+        config.silero_vad.max_speech_duration = 8
+        config.sample_rate = 16000
+        self.window_size = config.silero_vad.window_size
+        self.vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)
+        self.buffer = []
+        self.offset = 0
+        self.started = False
+        self.started_time = .0
+        self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+        stdout_cmd('info', 'Sherpa-ONNX SenseVoice recognizer started.')
+
+    def send_audio_frame(self, data: bytes):
+        """
+        Send an audio frame to the SOSV engine; the engine recognizes it and writes the result to standard output
+
+        Args:
+            data: audio frame data; the sample rate must be 16000 Hz
+        """
+        caption = {}
+        caption['command'] = 'caption'
+        caption['translation'] = ''
+
+        data_np = np.frombuffer(data, dtype=np.int16).astype(np.float32)
+        self.buffer = np.concatenate([self.buffer, data_np])
+        while self.offset + self.window_size < len(self.buffer):
+            self.vad.accept_waveform(self.buffer[self.offset: self.offset + self.window_size])
+            if not self.started and self.vad.is_speech_detected():
+                self.started = True
+                self.started_time = time.time()
+            self.offset += self.window_size
+
+        if not self.started:
+            if len(self.buffer) > 10 * self.window_size:
+                self.offset -= len(self.buffer) - 10 * self.window_size
+                self.buffer = self.buffer[-10 * self.window_size:]
+
+        if self.started and time.time() - self.started_time > 0.2:
+            stream = self.recognizer.create_stream()
+            stream.accept_waveform(16000, self.buffer)
+            self.recognizer.decode_stream(stream)
+            text = stream.result.text.strip()
+            if text and self.prev_content != text:
+                caption['index'] = self.cur_id
+                caption['text'] = text
+                caption['time_s'] = self.time_str
+                caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+                self.prev_content = text
+                stdout_obj(caption)
+            self.started_time = time.time()
+
+        while not self.vad.empty():
+            stream = self.recognizer.create_stream()
+            stream.accept_waveform(16000, self.vad.front.samples)
+            self.vad.pop()
+            self.recognizer.decode_stream(stream)
+            text = stream.result.text.strip()
+
+            caption['index'] = self.cur_id
+            caption['text'] = text
+            caption['time_s'] = self.time_str
+            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+            self.prev_content = ''
+            stdout_obj(caption)
+
+            self.cur_id += 1
+            self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+            self.buffer = []
+            self.offset = 0
+            self.started = False
+            self.started_time = .0
+
+    def translate(self):
+        """Continuously read audio frames from the shared data, run recognition, and write the results to standard output"""
+        global shared_data
+        while shared_data.status == 'running':
+            chunk = shared_data.chunk_queue.get()
+            self.send_audio_frame(chunk)
+
+    def stop(self):
+        """Stop the SenseVoice model"""
+        stdout_cmd('info', 'Sherpa-ONNX SenseVoice recognizer closed.')
engine/audio2text/vosk.py

@@ -4,6 +4,7 @@ import time
 from datetime import datetime

 from vosk import Model, KaldiRecognizer, SetLogLevel
+from utils import shared_data
 from utils import stdout_cmd, stdout_obj, google_translate, ollama_translate


@@ -82,6 +83,13 @@ class VoskRecognizer:

         stdout_obj(caption)

+    def translate(self):
+        """Continuously read audio frames from the shared data, run recognition, and write the results to standard output"""
+        global shared_data
+        while shared_data.status == 'running':
+            chunk = shared_data.chunk_queue.get()
+            self.send_audio_frame(chunk)
+
     def stop(self):
         """Stop the Vosk engine"""
         stdout_cmd('info', 'Vosk recognizer closed.')
engine/main.py

@@ -1,70 +1,120 @@
+import wave
 import argparse
-from utils import stdout_cmd, stdout_err
-from utils import thread_data, start_server
+import threading
+from utils import stdout, stdout_cmd
+from utils import shared_data, start_server
 from utils import merge_chunk_channels, resample_chunk_mono
-from audio2text import InvalidParameter, GummyRecognizer
+from audio2text import GummyRecognizer
 from audio2text import VoskRecognizer
+from audio2text import SosvRecognizer
 from sysaudio import AudioStream


+def audio_recording(stream: AudioStream, resample: bool, save = False):
+    global shared_data
+    stream.open_stream()
+    if save:
+        wf = wave.open(f'record.wav', 'wb')
+        wf.setnchannels(1)
+        wf.setsampwidth(stream.SAMP_WIDTH)
+        wf.setframerate(16000)
+    while shared_data.status == 'running':
+        raw_chunk = stream.read_chunk()
+        if raw_chunk is None: continue
+        if resample:
+            chunk = resample_chunk_mono(raw_chunk, stream.CHANNELS, stream.RATE, 16000)
+        else:
+            chunk = merge_chunk_channels(raw_chunk, stream.CHANNELS)
+        shared_data.chunk_queue.put(chunk)
+        if save: wf.writeframes(chunk) # type: ignore
+    if save: wf.close() # type: ignore
+    stream.close_stream_signal()
+
+
 def main_gummy(s: str, t: str, a: int, c: int, k: str):
-    global thread_data
+    """
+    Parameters:
+        s: Source language
+        t: Target language
+        k: Aliyun Bailian API key
+    """
     stream = AudioStream(a, c)
     if t == 'none':
         engine = GummyRecognizer(stream.RATE, s, None, k)
     else:
         engine = GummyRecognizer(stream.RATE, s, t, k)

-    stream.open_stream()
     engine.start()
-    chunk_mono = bytes()
-    restart_count = 0
-    while thread_data.status == "running":
-        try:
-            chunk = stream.read_chunk()
-            if chunk is None: continue
-            chunk_mono = merge_chunk_channels(chunk, stream.CHANNELS)
-            try:
-                engine.send_audio_frame(chunk_mono)
-            except InvalidParameter as e:
-                restart_count += 1
-                if restart_count > 5:
-                    stdout_err(str(e))
-                    thread_data.status = "kill"
-                    stdout_cmd('kill')
-                    break
-                else:
-                    stdout_cmd('info', f'Gummy engine stopped, restart attempt: {restart_count}...')
-        except KeyboardInterrupt:
-            break
-
-    engine.send_audio_frame(chunk_mono)
-    stream.close_stream()
+    stream_thread = threading.Thread(
+        target=audio_recording,
+        args=(stream, False),
+        daemon=True
+    )
+    stream_thread.start()
+    try:
+        engine.translate()
+    except KeyboardInterrupt:
+        stdout("Keyboard interrupt detected. Exiting...")
     engine.stop()


-def main_vosk(a: int, c: int, m: str, t: str, tm: str, on: str):
-    global thread_data
+def main_vosk(a: int, c: int, vosk: str, t: str, tm: str, omn: str):
+    """
+    Parameters:
+        a: Audio source: 0 for output, 1 for input
+        c: Chunk number in 1 second
+        vosk: Vosk model path
+        t: Target language
+        tm: Translation model type, ollama or google
+        omn: Ollama model name
+    """
     stream = AudioStream(a, c)
-    engine = VoskRecognizer(
-        m, None if t == 'none' else t,
-        tm, on
-    )
+    if t == 'none':
+        engine = VoskRecognizer(vosk, None, tm, omn)
+    else:
+        engine = VoskRecognizer(vosk, t, tm, omn)

-    stream.open_stream()
     engine.start()
-    while thread_data.status == "running":
-        try:
-            chunk = stream.read_chunk()
-            if chunk is None: continue
-            chunk_mono = resample_chunk_mono(chunk, stream.CHANNELS, stream.RATE, 16000)
-            engine.send_audio_frame(chunk_mono)
-        except KeyboardInterrupt:
-            break
-
-    stream.close_stream()
+    stream_thread = threading.Thread(
+        target=audio_recording,
+        args=(stream, True),
+        daemon=True
+    )
+    stream_thread.start()
+    try:
+        engine.translate()
+    except KeyboardInterrupt:
+        stdout("Keyboard interrupt detected. Exiting...")
+    engine.stop()
+
+
+def main_sosv(a: int, c: int, sosv: str, t: str, tm: str, omn: str):
+    """
+    Parameters:
+        a: Audio source: 0 for output, 1 for input
+        c: Chunk number in 1 second
+        sosv: Sherpa-ONNX SenseVoice model path
+        t: Target language
+        tm: Translation model type, ollama or google
+        omn: Ollama model name
+    """
+    stream = AudioStream(a, c)
+    if t == 'none':
+        engine = SosvRecognizer(sosv, None, tm, omn)
+    else:
+        engine = SosvRecognizer(sosv, t, tm, omn)
+
+    engine.start()
+    stream_thread = threading.Thread(
+        target=audio_recording,
+        args=(stream, True),
+        daemon=True
+    )
+    stream_thread.start()
+    try:
+        engine.translate()
+    except KeyboardInterrupt:
+        stdout("Keyboard interrupt detected. Exiting...")
     engine.stop()
@@ -74,19 +124,22 @@ if __name__ == "__main__":
     parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy or vosk')
     parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
     parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second')
-    parser.add_argument('-p', '--port', default=8080, help='The port to run the server on, 0 for no server')
+    parser.add_argument('-p', '--port', default=0, help='The port to run the server on, 0 for no server')
     parser.add_argument('-t', '--target_language', default='zh', help='Target language code, "none" for no translation')
     # gummy only
     parser.add_argument('-s', '--source_language', default='en', help='Source language code')
     parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
+    # vosk and sosv
+    parser.add_argument('-tm', '--translation_model', default='ollama', help='Model for translation: ollama or google')
+    parser.add_argument('-omn', '--ollama_name', default='', help='Ollama model name for translation')
     # vosk only
-    parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
-    parser.add_argument('-tm', '--translation_model', default='', help='Google translate API KEY')
-    parser.add_argument('-on', '--ollama_name', default='', help='Ollama model name for translation')
+    parser.add_argument('-vosk', '--vosk_model', default='', help='The path to the vosk model.')
+    # sosv only
+    parser.add_argument('-sosv', '--sosv_model', default=None, help='The SenseVoice model path')

     args = parser.parse_args()
     if int(args.port) == 0:
-        thread_data.status = "running"
+        shared_data.status = "running"
     else:
         start_server(int(args.port))
@@ -102,7 +155,16 @@ if __name__ == "__main__":
         main_vosk(
             int(args.audio_type),
             int(args.chunk_rate),
-            args.model_path,
+            args.vosk_model,
+            args.target_language,
+            args.translation_model,
+            args.ollama_name
+        )
+    elif args.caption_engine == 'sosv':
+        main_sosv(
+            int(args.audio_type),
+            int(args.chunk_rate),
+            args.sosv_model,
             args.target_language,
             args.translation_model,
             args.ollama_name
@@ -110,5 +172,5 @@ if __name__ == "__main__":
     else:
         raise ValueError('Invalid caption engine specified.')

-    if thread_data.status == "kill":
+    if shared_data.status == "kill":
         stdout_cmd('kill')
engine/sysaudio/… (AudioStream, PyAudio backend)

@@ -37,14 +37,13 @@ class AudioStream:
         self.FORMAT = pyaudio.paInt16
         self.SAMP_WIDTH = pyaudio.get_sample_size(self.FORMAT)
         self.CHANNELS = int(self.device["maxInputChannels"])
-        self.RATE = int(self.device["defaultSampleRate"])
-        self.CHUNK = self.RATE // chunk_rate
+        self.DEFAULT_RATE = int(self.device["defaultSampleRate"])
+        self.CHUNK_RATE = chunk_rate

-    def reset_chunk_size(self, chunk_size: int):
-        """
-        Reset the audio chunk size
-        """
-        self.CHUNK = chunk_size
+        self.RATE = 16000
+        self.CHUNK = self.RATE // self.CHUNK_RATE
+        self.open_stream()
+        self.close_stream()

     def get_info(self):
         dev_info = f"""
@@ -72,16 +71,27 @@ class AudioStream:
         Open and return the system audio output stream
         """
         if self.stream: return self.stream
-        self.stream = self.mic.open(
-            format = self.FORMAT,
-            channels = int(self.CHANNELS),
-            rate = self.RATE,
-            input = True,
-            input_device_index = int(self.INDEX)
-        )
+        try:
+            self.stream = self.mic.open(
+                format = self.FORMAT,
+                channels = int(self.CHANNELS),
+                rate = self.RATE,
+                input = True,
+                input_device_index = int(self.INDEX)
+            )
+        except OSError:
+            self.RATE = self.DEFAULT_RATE
+            self.CHUNK = self.RATE // self.CHUNK_RATE
+            self.stream = self.mic.open(
+                format = self.FORMAT,
+                channels = int(self.CHANNELS),
+                rate = self.RATE,
+                input = True,
+                input_device_index = int(self.INDEX)
+            )
         return self.stream

-    def read_chunk(self):
+    def read_chunk(self) -> bytes | None:
         """
         Read audio data
         """
engine/sysaudio/… (AudioStream, PulseAudio parec backend)

@@ -55,15 +55,10 @@ class AudioStream:
         self.FORMAT = 16
         self.SAMP_WIDTH = 2
         self.CHANNELS = 2
-        self.RATE = 48000
+        self.RATE = 16000
+        self.CHUNK_RATE = chunk_rate
         self.CHUNK = self.RATE // chunk_rate

-    def reset_chunk_size(self, chunk_size: int):
-        """
-        Reset the audio chunk size
-        """
-        self.CHUNK = chunk_size
-
     def get_info(self):
         dev_info = f"""
         Audio capture process:
@@ -84,7 +79,7 @@ class AudioStream:
         Start the audio capture process
         """
         self.process = subprocess.Popen(
-            ["parec", "-d", self.source, "--format=s16le", "--rate=48000", "--channels=2"],
+            ["parec", "-d", self.source, "--format=s16le", "--rate=16000", "--channels=2"],
             stdout=subprocess.PIPE
         )

engine/sysaudio/… (AudioStream, second PyAudio backend)

@@ -61,14 +61,13 @@ class AudioStream:
         self.FORMAT = pyaudio.paInt16
         self.SAMP_WIDTH = pyaudio.get_sample_size(self.FORMAT)
         self.CHANNELS = int(self.device["maxInputChannels"])
-        self.RATE = int(self.device["defaultSampleRate"])
-        self.CHUNK = self.RATE // chunk_rate
+        self.DEFAULT_RATE = int(self.device["defaultSampleRate"])
+        self.CHUNK_RATE = chunk_rate

-    def reset_chunk_size(self, chunk_size: int):
-        """
-        Reset the audio chunk size
-        """
-        self.CHUNK = chunk_size
+        self.RATE = 16000
+        self.CHUNK = self.RATE // self.CHUNK_RATE
+        self.open_stream()
+        self.close_stream()

     def get_info(self):
         dev_info = f"""
@@ -96,13 +95,24 @@ class AudioStream:
         Open and return the system audio output stream
         """
         if self.stream: return self.stream
-        self.stream = self.mic.open(
-            format = self.FORMAT,
-            channels = self.CHANNELS,
-            rate = self.RATE,
-            input = True,
-            input_device_index = self.INDEX
-        )
+        try:
+            self.stream = self.mic.open(
+                format = self.FORMAT,
+                channels = self.CHANNELS,
+                rate = self.RATE,
+                input = True,
+                input_device_index = self.INDEX
+            )
+        except OSError:
+            self.RATE = self.DEFAULT_RATE
+            self.CHUNK = self.RATE // self.CHUNK_RATE
+            self.stream = self.mic.open(
+                format = self.FORMAT,
+                channels = self.CHANNELS,
+                rate = self.RATE,
+                input = True,
+                input_device_index = self.INDEX
+            )
         return self.stream

     def read_chunk(self) -> bytes | None:
engine/utils/__init__.py

@@ -5,6 +5,6 @@ from .audioprcs import (
     resample_mono_chunk
 )
 from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr
-from .thdata import thread_data
+from .shared import shared_data
 from .server import start_server
 from .translation import ollama_translate, google_translate
|
|||||||
# (length,)
|
# (length,)
|
||||||
chunk_mono = np.mean(chunk_np.astype(np.float32), axis=1)
|
chunk_mono = np.mean(chunk_np.astype(np.float32), axis=1)
|
||||||
|
|
||||||
|
if orig_sr == target_sr:
|
||||||
|
return chunk_mono.astype(np.int16).tobytes()
|
||||||
|
|
||||||
ratio = target_sr / orig_sr
|
ratio = target_sr / orig_sr
|
||||||
chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
|
chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
|
||||||
chunk_mono_r = np.round(chunk_mono_r).astype(np.int16)
|
chunk_mono_r = np.round(chunk_mono_r).astype(np.int16)
|
||||||
|
real_len = round(chunk_mono.shape[0] * ratio)
|
||||||
|
if(chunk_mono_r.shape[0] > real_len):
|
||||||
|
chunk_mono_r = chunk_mono_r[:real_len]
|
||||||
|
else:
|
||||||
|
while chunk_mono_r.shape[0] < real_len:
|
||||||
|
chunk_mono_r = np.append(chunk_mono_r, chunk_mono_r[-1])
|
||||||
return chunk_mono_r.tobytes()
|
return chunk_mono_r.tobytes()
|
||||||
|
|
||||||
|
|
||||||
@@ -81,9 +90,18 @@ def resample_chunk_mono_np(chunk: bytes, channels: int, orig_sr: int, target_sr:
|
|||||||
# (length,)
|
# (length,)
|
||||||
chunk_mono = np.mean(chunk_np.astype(np.float32), axis=1)
|
chunk_mono = np.mean(chunk_np.astype(np.float32), axis=1)
|
||||||
|
|
||||||
|
if orig_sr == target_sr:
|
||||||
|
return chunk_mono.astype(dtype)
|
||||||
|
|
||||||
ratio = target_sr / orig_sr
|
ratio = target_sr / orig_sr
|
||||||
chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
|
chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
|
||||||
chunk_mono_r = chunk_mono_r.astype(dtype)
|
chunk_mono_r = chunk_mono_r.astype(dtype)
|
||||||
|
real_len = round(chunk_mono.shape[0] * ratio)
|
||||||
|
if(chunk_mono_r.shape[0] > real_len):
|
||||||
|
chunk_mono_r = chunk_mono_r[:real_len]
|
||||||
|
else:
|
||||||
|
while chunk_mono_r.shape[0] < real_len:
|
||||||
|
chunk_mono_r = np.append(chunk_mono_r, chunk_mono_r[-1])
|
||||||
return chunk_mono_r
|
return chunk_mono_r
|
||||||
|
|
||||||
|
|
||||||
@@ -100,9 +118,16 @@ def resample_mono_chunk(chunk: bytes, orig_sr: int, target_sr: int, mode="sinc_b
|
|||||||
Return:
|
Return:
|
||||||
单通道音频数据块
|
单通道音频数据块
|
||||||
"""
|
"""
|
||||||
|
if orig_sr == target_sr: return chunk
|
||||||
chunk_np = np.frombuffer(chunk, dtype=np.int16)
|
chunk_np = np.frombuffer(chunk, dtype=np.int16)
|
||||||
chunk_np = chunk_np.astype(np.float32)
|
chunk_np = chunk_np.astype(np.float32)
|
||||||
ratio = target_sr / orig_sr
|
ratio = target_sr / orig_sr
|
||||||
chunk_r = samplerate.resample(chunk_np, ratio, converter_type=mode)
|
chunk_r = samplerate.resample(chunk_np, ratio, converter_type=mode)
|
||||||
chunk_r = np.round(chunk_r).astype(np.int16)
|
chunk_r = np.round(chunk_r).astype(np.int16)
|
||||||
|
real_len = round(chunk_np.shape[0] * ratio)
|
||||||
|
if(chunk_r.shape[0] > real_len):
|
||||||
|
chunk_r = chunk_r[:real_len]
|
||||||
|
else:
|
||||||
|
while chunk_r.shape[0] < real_len:
|
||||||
|
chunk_r = np.append(chunk_r, chunk_r[-1])
|
||||||
return chunk_r.tobytes()
|
return chunk_r.tobytes()
|
||||||
|
|||||||
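A note on the real_len correction added in these hunks: libsamplerate may return one sample more or fewer than the expected round(n * ratio) for a given chunk, which would slowly drift chunked audio out of sync, so each resampled chunk is trimmed or padded to the exact length. A standalone sketch of the same idea (the function name and the converter_type choice are illustrative, not taken from the repo):

import numpy as np
import samplerate

def resample_exact(mono: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample int16 mono audio and pin the output to round(n * ratio) samples."""
    ratio = target_sr / orig_sr
    out = samplerate.resample(mono.astype(np.float32), ratio, converter_type="sinc_best")
    real_len = round(mono.shape[0] * ratio)
    if out.shape[0] > real_len:
        out = out[:real_len]                  # trim the surplus sample(s)
    else:
        while out.shape[0] < real_len:
            out = np.append(out, out[-1])     # repeat the last sample to pad
    return np.round(out).astype(np.int16)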
engine/utils/server.py

@@ -1,13 +1,12 @@
 import socket
 import threading
 import json
-# import time
-from utils import thread_data, stdout_cmd, stderr
+from utils import shared_data, stdout_cmd, stderr


 def handle_client(client_socket):
-    global thread_data
-    while thread_data.status == 'running':
+    global shared_data
+    while shared_data.status == 'running':
         try:
             data = client_socket.recv(4096).decode('utf-8')
             if not data:
@@ -15,13 +14,13 @@ def handle_client(client_socket):
             data = json.loads(data)

             if data['command'] == 'stop':
-                thread_data.status = 'stop'
+                shared_data.status = 'stop'
                 break
         except Exception as e:
             stderr(f'Communication error: {e}')
             break

-    thread_data.status = 'stop'
+    shared_data.status = 'stop'
     client_socket.close()
@@ -34,7 +33,6 @@ def start_server(port: int):
         stderr(str(e))
         stdout_cmd('kill')
         return
-    # time.sleep(20)
     stdout_cmd('connect')

     client, addr = server.accept()
engine/utils/shared.py (new file)

@@ -0,0 +1,8 @@
+import queue
+
+class SharedData:
+    def __init__(self):
+        self.status = "running"
+        self.chunk_queue = queue.Queue()
+
+shared_data = SharedData()
engine/utils/thdata.py (deleted)

@@ -1,5 +0,0 @@
-class ThreadData:
-    def __init__(self):
-        self.status = "running"
-
-thread_data = ThreadData()
src/… (CaptionEngine, Electron main process, TypeScript)

@@ -81,9 +81,9 @@ export class CaptionEngine {
         }
         else if(allConfig.controls.engine === 'vosk'){
             this.command.push('-e', 'vosk')
-            this.command.push('-m', `"${allConfig.controls.modelPath}"`)
+            this.command.push('-vosk', `"${allConfig.controls.modelPath}"`)
             this.command.push('-tm', allConfig.controls.transModel)
-            this.command.push('-on', allConfig.controls.ollamaName)
+            this.command.push('-omn', allConfig.controls.ollamaName)
         }
     }
     Log.info('Engine Path:', this.appPath)