mirror of
https://github.com/HiMeditator/auto-caption.git
synced 2026-05-18 03:27:29 +08:00
feat(engine): 字幕引擎添加在终端直接显示字幕的功能
This commit is contained in:
@@ -64,6 +64,7 @@ Python 端监听到的音频流转换为的字幕数据。
|
|||||||
{
|
{
|
||||||
command: "translation",
|
command: "translation",
|
||||||
time_s: string,
|
time_s: string,
|
||||||
|
text: string,
|
||||||
translation: string
|
translation: string
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import wave
|
|||||||
import argparse
|
import argparse
|
||||||
import threading
|
import threading
|
||||||
import datetime
|
import datetime
|
||||||
from utils import stdout, stdout_cmd
|
from utils import stdout, stdout_cmd, change_caption_display
|
||||||
from utils import shared_data, start_server
|
from utils import shared_data, start_server
|
||||||
from utils import merge_chunk_channels, resample_chunk_mono
|
from utils import merge_chunk_channels, resample_chunk_mono
|
||||||
from audio2text import GummyRecognizer
|
from audio2text import GummyRecognizer
|
||||||
@@ -142,11 +142,12 @@ def main_sosv(a: int, c: int, sosv: str, s: str, t: str, tm: str, omn: str, r: b
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(description='Convert system audio stream to text')
|
parser = argparse.ArgumentParser(description='Convert system audio stream to text')
|
||||||
# all
|
# all
|
||||||
parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy or vosk')
|
parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy or vosk or sosv')
|
||||||
parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
|
parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
|
||||||
parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second')
|
parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second')
|
||||||
parser.add_argument('-p', '--port', default=0, help='The port to run the server on, 0 for no server')
|
parser.add_argument('-p', '--port', default=0, help='The port to run the server on, 0 for no server')
|
||||||
parser.add_argument('-t', '--target_language', default='zh', help='Target language code, "none" for no translation')
|
parser.add_argument('-d', '--display_caption', default=0, help='Display caption on terminal, 0 for no display, 1 for display')
|
||||||
|
parser.add_argument('-t', '--target_language', default='none', help='Target language code, "none" for no translation')
|
||||||
parser.add_argument('-r', '--record', default=0, help='Whether to record the audio, 0 for no recording, 1 for recording')
|
parser.add_argument('-r', '--record', default=0, help='Whether to record the audio, 0 for no recording, 1 for recording')
|
||||||
parser.add_argument('-rp', '--record_path', default='', help='Path to save the recorded audio')
|
parser.add_argument('-rp', '--record_path', default='', help='Path to save the recorded audio')
|
||||||
# gummy and sosv
|
# gummy and sosv
|
||||||
@@ -167,6 +168,10 @@ if __name__ == "__main__":
|
|||||||
else:
|
else:
|
||||||
start_server(int(args.port))
|
start_server(int(args.port))
|
||||||
|
|
||||||
|
if int(args.display_caption) != 0:
|
||||||
|
change_caption_display(True)
|
||||||
|
print("Caption will be displayed on terminal")
|
||||||
|
|
||||||
if args.caption_engine == 'gummy':
|
if args.caption_engine == 'gummy':
|
||||||
main_gummy(
|
main_gummy(
|
||||||
args.source_language,
|
args.source_language,
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
from .audioprcs import merge_chunk_channels, resample_chunk_mono
|
from .audioprcs import merge_chunk_channels, resample_chunk_mono
|
||||||
from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr
|
from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr
|
||||||
|
from .sysout import change_caption_display
|
||||||
from .shared import shared_data
|
from .shared import shared_data
|
||||||
from .server import start_server
|
from .server import start_server
|
||||||
from .translation import ollama_translate, google_translate
|
from .translation import ollama_translate, google_translate
|
||||||
@@ -1,5 +1,10 @@
|
|||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
|
import sherpa_onnx
|
||||||
|
|
||||||
|
# Module-level state for the optional terminal caption display.

# When True, caption/translation objects are rendered on the terminal
# instead of being written to stdout as JSON lines.
display_caption: bool = False

# Index of the caption sentence currently shown; -1 means nothing has been
# shown yet.
caption_index: int = -1

# Terminal renderer shared by the display helpers in this module.
display = sherpa_onnx.Display()
|
||||||
|
|
||||||
def stdout(text: str):
|
def stdout(text: str):
|
||||||
stdout_cmd("print", text)
|
stdout_cmd("print", text)
|
||||||
@@ -12,7 +17,39 @@ def stdout_cmd(command: str, content = ""):
|
|||||||
sys.stdout.write(json.dumps(msg) + "\n")
|
sys.stdout.write(json.dumps(msg) + "\n")
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
|
|
||||||
|
def change_caption_display(val: bool) -> None:
    """Turn terminal caption display on or off.

    Args:
        val: New value stored in the module-level ``display_caption`` flag.
    """
    # The flag lives at module scope, so rebinding it requires ``global``.
    global display_caption
    display_caption = val
|
||||||
|
|
||||||
|
def caption_display(obj):
    """Render a caption object on the terminal display.

    Args:
        obj: Mapping with ``index``, ``text`` and ``translation`` keys.
            ``index`` must be convertible with ``int()``.
    """
    # Only ``caption_index`` is rebound here; ``display`` is merely read.
    global caption_index

    incoming_index = int(obj['index'])

    # A different index than the one already on screen means a new sentence
    # has started, so close out the current one first.
    if caption_index >= 0 and incoming_index != caption_index:
        display.finalize_current_sentence()
    caption_index = incoming_index

    line = f"{obj['text']} {obj['translation']}"
    display.update_text(line)
    display.display()
|
||||||
|
|
||||||
|
def translation_display(obj):
    """Render a translation object on the terminal and finalize the sentence.

    Args:
        obj: Mapping with ``text`` and ``translation`` keys.
    """
    # ``display`` is only read, so no ``global`` declaration is needed.
    line = f"{obj['text']} {obj['translation']}"
    display.update_text(line)
    display.display()
    # Unlike caption updates, the sentence is finalized on every call.
    display.finalize_current_sentence()
|
||||||
|
|
||||||
def stdout_obj(obj):
    """Emit a message object to the terminal display or as a JSON line.

    When the module-level ``display_caption`` flag is set, objects whose
    ``command`` is ``'caption'`` or ``'translation'`` are handed to the
    terminal renderers and are not written as JSON. Every other object, and
    every object while the flag is unset, is serialized with ``json.dumps``,
    written to stdout followed by a newline, and flushed.

    Args:
        obj: JSON-serializable mapping. The ``command`` key, if present,
            selects the route described above.
    """
    # No debug output here: stdout carries one JSON document per line, so
    # any extra text would corrupt the stream.
    if display_caption:
        # ``get`` keeps objects without a ``command`` key on the JSON path
        # instead of raising KeyError.
        command = obj.get('command')
        if command == 'caption':
            caption_display(obj)
            return
        if command == 'translation':
            translation_display(obj)
            return

    sys.stdout.write(json.dumps(obj) + "\n")
    sys.stdout.flush()
|
||||||
|
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ def ollama_translate(model: str, target: str, text: str, time_s: str):
|
|||||||
stdout_obj({
|
stdout_obj({
|
||||||
"command": "translation",
|
"command": "translation",
|
||||||
"time_s": time_s,
|
"time_s": time_s,
|
||||||
|
"text": text,
|
||||||
"translation": content.strip()
|
"translation": content.strip()
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -43,6 +44,7 @@ def google_translate(model: str, target: str, text: str, time_s: str):
|
|||||||
stdout_obj({
|
stdout_obj({
|
||||||
"command": "translation",
|
"command": "translation",
|
||||||
"time_s": time_s,
|
"time_s": time_s,
|
||||||
|
"text": text,
|
||||||
"translation": res.text
|
"translation": res.text
|
||||||
})
|
})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Reference in New Issue
Block a user