mirror of
https://github.com/HiMeditator/auto-caption.git
synced 2026-02-13 02:54:48 +08:00
feat(engine): 优化字幕引擎输出格式、准备合并两个字幕引擎
- 重构字幕引擎相关代码 - 准备合并两个字幕引擎
This commit is contained in:
@@ -15,18 +15,20 @@ class Callback(TranslationRecognizerCallback):
|
||||
"""
|
||||
def __init__(self):
    """Initialize counters and per-sentence state for the recognizer callback."""
    super().__init__()
    self.index = 0      # index of the caption currently being emitted
    self.usage = 0      # accumulated usage (duration) reported by the service
    self.cur_id = -1    # id of the sentence currently being recognized
    self.time_str = ''  # start time of the current sentence
    # NOTE(review): the original assigned self.index = 0 twice; the duplicate
    # assignment was removed (no behavior change).
def on_open(self) -> None:
    """Reset per-session state when the recognizer connection is opened."""
    # A new session starts: discard tracking left over from any previous run.
    self.cur_id = -1
    self.time_str = ''
    self.usage = 0
    stdout_cmd('info', 'Gummy translator started.')
def on_close(self) -> None:
    """Announce shutdown and report the accumulated usage total."""
    total = str(self.usage)
    stdout_cmd('info', 'Gummy translator closed.')
    stdout_cmd('usage', total)
def on_event(
|
||||
self,
|
||||
@@ -46,7 +48,6 @@ class Callback(TranslationRecognizerCallback):
|
||||
caption['index'] = self.index
|
||||
caption['time_s'] = self.time_str
|
||||
caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
|
||||
caption['end'] = transcription_result.is_sentence_end
|
||||
caption['text'] = transcription_result.text
|
||||
caption['translation'] = ""
|
||||
|
||||
@@ -57,7 +58,8 @@ class Callback(TranslationRecognizerCallback):
|
||||
if usage:
|
||||
self.usage += usage['duration']
|
||||
|
||||
stdout_obj(caption)
|
||||
if 'text' in caption:
|
||||
stdout_obj(caption)
|
||||
|
||||
|
||||
class GummyTranslator:
|
||||
@@ -88,7 +90,7 @@ class GummyTranslator:
|
||||
self.translator.start()
|
||||
|
||||
def send_audio_frame(self, data):
    """Forward an audio frame to the Gummy translator.

    Recognition/translation results are emitted to standard output by the
    engine's callbacks, not returned here.

    Args:
        data: raw audio frame bytes to feed to the streaming translator.
    """
    # The original had two back-to-back docstrings (a merge artifact); the
    # second was a dead string-literal statement and has been removed.
    self.translator.send_audio_frame(data)
|
||||
def stop(self):
|
||||
|
||||
59
engine/audio2text/vosk.py
Normal file
59
engine/audio2text/vosk.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
from vosk import Model, KaldiRecognizer, SetLogLevel
|
||||
from utils import stdout_obj
|
||||
|
||||
class VoskRecognizer:
    """
    Stream audio through the Vosk engine and print caption objects, as JSON
    strings readable by the Auto Caption app, to standard output.

    Init args:
        model_path: path to the Vosk recognition model directory.
    """

    def __init__(self, model_path: str):
        # BUG FIX: was misspelled `__int__`, so the constructor was never
        # called and `VoskRecognizer(path)` raised TypeError.
        SetLogLevel(-1)  # silence Vosk's internal logging
        # Strip surrounding double quotes that may arrive from the CLI.
        if model_path.startswith('"'):
            model_path = model_path[1:]
        if model_path.endswith('"'):
            model_path = model_path[:-1]
        self.model_path = model_path
        self.time_str = ''       # start time of the sentence being recognized
        self.cur_id = 0          # index of the current caption
        self.prev_content = ''   # last partial text emitted (dedupe partials)

        self.model = Model(self.model_path)
        # Vosk recognizer is configured for 16 kHz input.
        self.recognizer = KaldiRecognizer(self.model, 16000)

    def send_audio_frame(self, data: bytes):
        """
        Feed an audio frame to the Vosk engine; the engine recognizes it and
        the recognition result is written to standard output.

        Args:
            data: audio frame bytes; the sample rate must be 16000 Hz.
        """
        caption = {}
        caption['command'] = 'caption'
        caption['translation'] = ''

        if self.recognizer.AcceptWaveform(data):
            # Final result: the current sentence is complete.
            content = json.loads(self.recognizer.Result()).get('text', '')
            caption['index'] = self.cur_id
            caption['text'] = content
            caption['time_s'] = self.time_str
            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            self.prev_content = ''
            self.cur_id += 1
        else:
            # Partial (in-progress) result.
            content = json.loads(self.recognizer.PartialResult()).get('partial', '')
            # Skip empty or unchanged partials — nothing new to display.
            if content == '' or content == self.prev_content:
                return
            if self.prev_content == '':
                # First partial of a new sentence: record its start time.
                self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            caption['index'] = self.cur_id
            caption['text'] = content
            caption['time_s'] = self.time_str
            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            self.prev_content = content

        stdout_obj(caption)
|
||||
@@ -49,6 +49,7 @@ def convert_audio_to_text(audio_type, chunk_rate, model_path):
|
||||
continue
|
||||
if prev_content == '':
|
||||
time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
|
||||
caption['command'] = 'caption'
|
||||
caption['index'] = cur_id
|
||||
caption['text'] = content
|
||||
caption['time_s'] = time_str
|
||||
|
||||
37
engine/main.py
Normal file
37
engine/main.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import argparse
|
||||
|
||||
def gummy_engine(s, t, a, c, k):
|
||||
pass
|
||||
|
||||
def vosk_engine(a, c, m):
|
||||
pass
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Convert system audio stream to text')
|
||||
# both
|
||||
parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy or vosk')
|
||||
parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
|
||||
parser.add_argument('-c', '--chunk_rate', default=20, help='Number of audio stream chunks collected per second')
|
||||
# gummy
|
||||
parser.add_argument('-s', '--source_language', default='en', help='Source language code')
|
||||
parser.add_argument('-t', '--target_language', default='zh', help='Target language code')
|
||||
parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
|
||||
# vosk
|
||||
parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
|
||||
args = parser.parse_args()
|
||||
if args.caption_engine == 'gummy':
|
||||
gummy_engine(
|
||||
args.source_language,
|
||||
args.target_language,
|
||||
int(args.audio_type),
|
||||
int(args.chunk_rate),
|
||||
args.api_key
|
||||
)
|
||||
elif args.caption_engine == 'vosk':
|
||||
vosk_engine(
|
||||
int(args.audio_type),
|
||||
int(args.chunk_rate),
|
||||
args.model_path
|
||||
)
|
||||
else:
|
||||
raise ValueError('Invalid caption engine specified.')
|
||||
Reference in New Issue
Block a user