diff --git a/docs/api-docs/caption-engine.md b/docs/api-docs/caption-engine.md
index be2f0ad..88cb100 100644
--- a/docs/api-docs/caption-engine.md
+++ b/docs/api-docs/caption-engine.md
@@ -64,6 +64,7 @@ Python 端监听到的音频流转换为的字幕数据。
 {
   command: "translation",
   time_s: string,
+  text: string,
   translation: string
 }
 ```
diff --git a/engine/main.py b/engine/main.py
index fba4845..76db7e8 100644
--- a/engine/main.py
+++ b/engine/main.py
@@ -2,7 +2,7 @@ import wave
 import argparse
 import threading
 import datetime
-from utils import stdout, stdout_cmd
+from utils import stdout, stdout_cmd, change_caption_display
 from utils import shared_data, start_server
 from utils import merge_chunk_channels, resample_chunk_mono
 from audio2text import GummyRecognizer
@@ -142,11 +142,12 @@ def main_sosv(a: int, c: int, sosv: str, s: str, t: str, tm: str, omn: str, r: b
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Convert system audio stream to text')
     # all
-    parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy or vosk')
+    parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy or vosk or sosv')
     parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
     parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second')
     parser.add_argument('-p', '--port', default=0, help='The port to run the server on, 0 for no server')
-    parser.add_argument('-t', '--target_language', default='zh', help='Target language code, "none" for no translation')
+    parser.add_argument('-d', '--display_caption', default=0, help='Display caption on terminal, 0 for no display, 1 for display')
+    parser.add_argument('-t', '--target_language', default='none', help='Target language code, "none" for no translation')
     parser.add_argument('-r', '--record', default=0, help='Whether to record the audio, 0 for no recording, 1 for recording')
     parser.add_argument('-rp', '--record_path', default='', help='Path to save the recorded audio')
     # gummy and sosv
@@ -167,6 +168,10 @@ if __name__ == "__main__":
     else:
         start_server(int(args.port))
 
+    if int(args.display_caption) != 0:
+        change_caption_display(True)
+        print("Caption will be displayed on terminal")
+
     if args.caption_engine == 'gummy':
         main_gummy(
             args.source_language,
diff --git a/engine/utils/__init__.py b/engine/utils/__init__.py
index 2acdad8..aaf2b4a 100644
--- a/engine/utils/__init__.py
+++ b/engine/utils/__init__.py
@@ -1,5 +1,6 @@
 from .audioprcs import merge_chunk_channels, resample_chunk_mono
 from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr
+from .sysout import change_caption_display
 from .shared import shared_data
 from .server import start_server
 from .translation import ollama_translate, google_translate
\ No newline at end of file
diff --git a/engine/utils/sysout.py b/engine/utils/sysout.py
index 96aa040..2c50fc9 100644
--- a/engine/utils/sysout.py
+++ b/engine/utils/sysout.py
@@ -1,5 +1,10 @@
 import sys
 import json
+import sherpa_onnx
+
+display_caption = False
+caption_index = -1
+display = sherpa_onnx.Display()
 
 def stdout(text: str):
     stdout_cmd("print", text)
@@ -12,7 +17,50 @@ def stdout_cmd(command: str, content = ""):
     sys.stdout.write(json.dumps(msg) + "\n")
     sys.stdout.flush()
 
+def change_caption_display(val: bool):
+    """Enable or disable rendering captions on the terminal."""
+    global display_caption
+    display_caption = val
+
+def caption_display(obj):
+    """Show a caption object on the terminal display.
+
+    Reads obj['index'], obj['text'] and obj['translation']. When the
+    index differs from the previously shown caption,
+    display.finalize_current_sentence() is called before the new text.
+    """
+    global caption_index
+
+    if caption_index >= 0 and caption_index != int(obj['index']):
+        display.finalize_current_sentence()
+    caption_index = int(obj['index'])
+    full_text = f"{obj['text']} {obj['translation']}"
+    display.update_text(full_text)
+    display.display()
+
+def translation_display(obj):
+    """Show a translation object on the terminal display.
+
+    Draws obj['text'] followed by obj['translation'], then calls
+    display.finalize_current_sentence().
+    """
+    full_text = f"{obj['text']} {obj['translation']}"
+    display.update_text(full_text)
+    display.display()
+    display.finalize_current_sentence()
+
 def stdout_obj(obj):
+    """Write obj to stdout as one JSON line.
+
+    When caption display is enabled, 'caption' and 'translation'
+    objects are rendered on the terminal instead of being written.
+    """
+    if obj['command'] == 'caption' and display_caption:
+        caption_display(obj)
+        return
+    if obj['command'] == 'translation' and display_caption:
+        translation_display(obj)
+        return
     sys.stdout.write(json.dumps(obj) + "\n")
     sys.stdout.flush()
 
diff --git a/engine/utils/translation.py b/engine/utils/translation.py
index ed3215d..b65b6c2 100644
--- a/engine/utils/translation.py
+++ b/engine/utils/translation.py
@@ -33,6 +33,7 @@ def ollama_translate(model: str, target: str, text: str, time_s: str):
     stdout_obj({
         "command": "translation",
         "time_s": time_s,
+        "text": text,
         "translation": content.strip()
     })
 
@@ -43,6 +44,7 @@ def google_translate(model: str, target: str, text: str, time_s: str):
         stdout_obj({
             "command": "translation",
             "time_s": time_s,
+            "text": text,
             "translation": res.text
         })
     except Exception as e: