mirror of
https://github.com/HiMeditator/auto-caption.git
synced 2026-02-04 12:24:42 +08:00
- Add a GLM-ASR cloud speech recognition engine implementation
- Extend the settings UI with GLM-related parameters
- Let Ollama use a custom domain and API key to support cloud and other LLMs
- Adjust the audio processing logic to support the new engine
- Update dependencies and build configuration
- Fix issues with the Ollama translation feature
164 lines
5.5 KiB
Python
import threading
import io
import wave
import struct
import math
import audioop  # removed from the stdlib in Python 3.13 (PEP 594); see fallback below
import requests
from datetime import datetime

from utils import shared_data
from utils import stdout_cmd, stdout_obj, google_translate, ollama_translate

class GlmRecognizer:
    """
    Processes audio data with the GLM-ASR engine and prints JSON strings that
    the Auto Caption app can read to standard output.

    Init parameters:
        url: GLM-ASR API URL
        model: GLM-ASR model name
        api_key: GLM-ASR API key
        source: source language
        target: target language (None disables translation)
        trans_model: translation model name
        ollama_name: Ollama model name
        ollama_url: Ollama server URL (optional)
        ollama_api_key: Ollama API key (optional)
    """
    def __init__(self, url: str, model: str, api_key: str, source: str, target: str | None, trans_model: str, ollama_name: str, ollama_url: str = '', ollama_api_key: str = ''):
        self.url = url
        self.model = model
        self.api_key = api_key
        self.source = source
        self.target = target
        if trans_model == 'google':
            self.trans_func = google_translate
        else:
            self.trans_func = ollama_translate
        self.ollama_name = ollama_name
        self.ollama_url = ollama_url
        self.ollama_api_key = ollama_api_key

        # Segmentation state: frames of the current utterance plus a
        # timestamp taken when speech is first detected
        self.audio_buffer = []
        self.is_speech = False
        self.silence_frames = 0
        self.speech_start_time = None
        self.time_str = ''
        self.cur_id = 0

        # VAD settings (assuming 16 kHz, 16-bit mono; chunk size 1024 or similar).
        # 16-bit = 2 bytes per sample.
        # The RMS threshold needs tuning; 500 is a conservative guess for silence.
        self.threshold = 500
        self.silence_limit = 15      # frames (approx. 0.5-1 s depending on chunk size)
        self.min_speech_frames = 10  # frames
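    # audioop was removed from the standard library in Python 3.13 (PEP 594).
    # The only call this class makes is audioop.rms(chunk, 2); a minimal
    # pure-Python replacement could look like the sketch below. It is a
    # hypothetical helper shown for illustration and is not wired into
    # process_audio.
    @staticmethod
    def _rms_fallback(chunk: bytes) -> int:
        """Root-mean-square amplitude of little-endian 16-bit PCM bytes."""
        samples = struct.unpack(f'<{len(chunk) // 2}h', chunk)
        if not samples:
            return 0
        return int(math.sqrt(sum(s * s for s in samples) / len(samples)))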
    def start(self):
        """Start the GLM engine."""
        stdout_cmd('info', 'GLM-ASR recognizer started.')

    def stop(self):
        """Stop the GLM engine."""
        stdout_cmd('info', 'GLM-ASR recognizer stopped.')
    def process_audio(self, chunk):
        # chunk is raw bytes (int16); sample width 2 matches 16-bit audio
        rms = audioop.rms(chunk, 2)

        if rms > self.threshold:
            if not self.is_speech:
                # Transition silence -> speech: stamp the utterance start time
                self.is_speech = True
                self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
                self.audio_buffer = []
            self.audio_buffer.append(chunk)
            self.silence_frames = 0
        else:
            if self.is_speech:
                self.audio_buffer.append(chunk)
                self.silence_frames += 1
                if self.silence_frames > self.silence_limit:
                    # Speech ended; drop segments too short to be real speech
                    if len(self.audio_buffer) > self.min_speech_frames:
                        self.recognize(self.audio_buffer, self.time_str)
                    self.is_speech = False
                    self.audio_buffer = []
                    self.silence_frames = 0
    def recognize(self, audio_frames, time_s):
        audio_bytes = b''.join(audio_frames)

        # Wrap the raw PCM in an in-memory WAV container (16 kHz, 16-bit mono)
        wav_io = io.BytesIO()
        with wave.open(wav_io, 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(16000)
            wav_file.writeframes(audio_bytes)
        wav_io.seek(0)

        # Send the request off-thread so audio processing is never blocked
        threading.Thread(
            target=self._do_request,
            args=(wav_io.read(), time_s, self.cur_id),
            daemon=True
        ).start()
        self.cur_id += 1
    def _do_request(self, audio_content, time_s, index):
        try:
            files = {
                'file': ('audio.wav', audio_content, 'audio/wav')
            }
            data = {
                'model': self.model,
                'stream': 'false'
            }
            headers = {
                'Authorization': f'Bearer {self.api_key}'
            }

            response = requests.post(self.url, headers=headers, data=data, files=files, timeout=15)

            if response.status_code == 200:
                res_json = response.json()
                text = res_json.get('text', '')
                if text:
                    self.output_caption(text, time_s, index)
            else:
                try:
                    err_msg = response.json()
                    stdout_cmd('error', f"GLM API Error: {err_msg}")
                except ValueError:
                    # Error body was not JSON; fall back to the raw text
                    stdout_cmd('error', f"GLM API Error: {response.text}")

        except Exception as e:
            stdout_cmd('error', f"GLM Request Failed: {str(e)}")
    def output_caption(self, text, time_s, index):
        caption = {
            'command': 'caption',
            'index': index,
            'time_s': time_s,
            'time_t': datetime.now().strftime('%H:%M:%S.%f')[:-3],
            'text': text,
            'translation': ''
        }

        # Translate asynchronously; the Ollama backend additionally needs the
        # server URL and API key, so its argument list is longer
        if self.target:
            if self.trans_func == ollama_translate:
                th = threading.Thread(
                    target=self.trans_func,
                    args=(self.ollama_name, self.target, caption['text'], time_s, self.ollama_url, self.ollama_api_key),
                    daemon=True
                )
            else:
                th = threading.Thread(
                    target=self.trans_func,
                    args=(self.ollama_name, self.target, caption['text'], time_s),
                    daemon=True
                )
            th.start()

        stdout_obj(caption)
    def translate(self):
        # Main loop: pull audio chunks from the shared queue until the engine
        # is told to stop. chunk_queue.get() blocks while no audio arrives.
        while shared_data.status == 'running':
            chunk = shared_data.chunk_queue.get()
            self.process_audio(chunk)
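
# A minimal standalone sketch of how this recognizer might be driven, assuming
# a 16 kHz 16-bit mono PCM stream on stdin. The URL, model name, and key are
# placeholders; inside Auto Caption itself the engine is fed through
# shared_data.chunk_queue via translate() instead.
if __name__ == '__main__':
    import sys

    recognizer = GlmRecognizer(
        url='https://example.com/audio/transcriptions',  # placeholder endpoint
        model='glm-asr',          # placeholder model name
        api_key='YOUR_API_KEY',   # placeholder
        source='zh',
        target=None,              # None disables translation
        trans_model='google',
        ollama_name=''
    )
    recognizer.start()
    try:
        while True:
            chunk = sys.stdin.buffer.read(2048)  # 1024 int16 samples per chunk
            if not chunk:
                break
            recognizer.process_audio(chunk)
    finally:
        recognizer.stop()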