# auto-caption/engine/main.py

import argparse
from utils import stdout_cmd, stdout_err
from utils import thread_data, start_server
from utils import merge_chunk_channels, resample_chunk_mono
from audio2text import InvalidParameter, GummyRecognizer
from audio2text import VoskRecognizer
from sysaudio import AudioStream


def main_gummy(s: str, t: str, a: int, c: int, k: str):
    """Stream captured audio into the Gummy recognizer until asked to stop.

    The loop runs while ``thread_data.status`` is ``"running"``. Each chunk
    read from the audio stream is merged to mono and sent to the engine.
    If the engine raises ``InvalidParameter`` more than five times in total,
    the error is reported, ``thread_data.status`` is set to ``"kill"`` and a
    ``kill`` command is emitted. The stream and engine are always released,
    even when an unexpected exception escapes the loop.

    Args:
        s: Source language code.
        t: Target language code; ``'none'`` is passed to the recognizer as
            ``None`` (no translation).
        a: Audio stream source handed to ``AudioStream`` (per the CLI help:
            0 for output, 1 for input).
        c: Number of audio chunks collected per second.
        k: API key for the Gummy model.
    """
    stream = AudioStream(a, c)
    engine = GummyRecognizer(stream.RATE, s, None if t == 'none' else t, k)

    stream.open_stream()
    engine_started = False
    try:
        engine.start()
        engine_started = True

        chunk_mono = bytes()
        restart_count = 0
        engine_failed = False

        while thread_data.status == "running":
            try:
                chunk = stream.read_chunk()
                if chunk is None:
                    continue
                chunk_mono = merge_chunk_channels(chunk, stream.CHANNELS)
                try:
                    engine.send_audio_frame(chunk_mono)
                except InvalidParameter as e:
                    # NOTE(review): the counter is cumulative and is never
                    # reset after a successful frame -- confirm that is the
                    # intended retry budget.
                    restart_count += 1
                    if restart_count > 5:
                        stdout_err(str(e))
                        thread_data.status = "kill"
                        stdout_cmd('kill')
                        engine_failed = True
                        break
                    stdout_cmd('info', f'Gummy engine stopped, restart attempt: {restart_count}...')
            except KeyboardInterrupt:
                break

        # Final send of the last chunk, kept from the original flow. It is
        # skipped once the engine has been given up on, and guarded so that a
        # rejected frame cannot prevent the cleanup below.
        if not engine_failed:
            try:
                engine.send_audio_frame(chunk_mono)
            except InvalidParameter as e:
                stdout_err(str(e))
    finally:
        # Same release order as before: stream first, then the engine.
        try:
            stream.close_stream()
        finally:
            if engine_started:
                engine.stop()


def main_vosk(a: int, c: int, m: str, t: str):
    """Stream captured audio into the Vosk recognizer until asked to stop.

    The loop runs while ``thread_data.status`` is ``"running"``. Each chunk
    read from the audio stream is converted with ``resample_chunk_mono``
    (target rate 16000) and sent to the engine. The stream and engine are
    always released, even when an unexpected exception escapes the loop.

    Args:
        a: Audio stream source handed to ``AudioStream`` (per the CLI help:
            0 for output, 1 for input).
        c: Number of audio chunks collected per second.
        m: Path to the Vosk model.
        t: Target language code; ``'none'`` is passed to the recognizer as
            ``None`` (no translation).
    """
    stream = AudioStream(a, c)
    engine = VoskRecognizer(m, None if t == 'none' else t)

    stream.open_stream()
    engine_started = False
    try:
        engine.start()
        engine_started = True

        while thread_data.status == "running":
            try:
                chunk = stream.read_chunk()
                if chunk is None:
                    continue
                chunk_mono = resample_chunk_mono(chunk, stream.CHANNELS, stream.RATE, 16000)
                engine.send_audio_frame(chunk_mono)
            except KeyboardInterrupt:
                break
    finally:
        # Same release order as before: stream first, then the engine.
        try:
            stream.close_stream()
        finally:
            if engine_started:
                engine.stop()


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the caption engine.

    Integer options are declared with ``type=int`` so argparse rejects
    malformed values with a usage error and callers receive real integers.
    """
    parser = argparse.ArgumentParser(description='Convert system audio stream to text')
    # both
    parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy or vosk')
    parser.add_argument('-a', '--audio_type', type=int, default=0, help='Audio stream source: 0 for output, 1 for input')
    parser.add_argument('-c', '--chunk_rate', type=int, default=10, help='Number of audio stream chunks collected per second')
    parser.add_argument('-p', '--port', type=int, default=8080, help='The port to run the server on, 0 for no server')
    parser.add_argument('-t', '--target_language', default='zh', help='Target language code, "none" for no translation')
    # gummy only
    parser.add_argument('-s', '--source_language', default='en', help='Source language code')
    parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
    # vosk only
    parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
    return parser


if __name__ == "__main__":
    args = build_parser().parse_args()

    # Reject an unknown engine up front, before a server is started for a run
    # that can never begin.
    if args.caption_engine not in ('gummy', 'vosk'):
        raise ValueError('Invalid caption engine specified.')

    if args.port == 0:
        # No server: mark the run as active directly.
        thread_data.status = "running"
    else:
        start_server(args.port)

    if args.caption_engine == 'gummy':
        main_gummy(
            args.source_language,
            args.target_language,
            args.audio_type,
            args.chunk_rate,
            args.api_key,
        )
    else:
        main_vosk(
            args.audio_type,
            args.chunk_rate,
            args.model_path,
            args.target_language,
        )

    if thread_data.status == "kill":
        stdout_cmd('kill')