feat(engine): add GLM-ASR speech recognition engine support

- Add a cloud-based GLM-ASR speech recognition engine implementation
- Extend the settings UI with GLM-related options
- Let Ollama use a custom base URL and API key, enabling cloud-hosted and other OpenAI-compatible LLMs
- Adjust the audio processing logic to support the new engine
- Update dependencies and build configuration
- Fix issues with the Ollama translation feature
nocmt
2026-01-10 16:02:24 +08:00
parent 383e582a2d
commit 0825e48902
19 changed files with 519 additions and 91 deletions

View File

@@ -1,3 +1,4 @@
from .gummy import GummyRecognizer
from .vosk import VoskRecognizer
from .sosv import SosvRecognizer
from .glm import GlmRecognizer

engine/audio2text/glm.py (new file, 163 lines)
View File

@@ -0,0 +1,163 @@
import threading
import io
import wave
import audioop  # note: audioop is deprecated and removed in Python 3.13 (PEP 594)
import requests
from datetime import datetime
from utils import shared_data
from utils import stdout_cmd, stdout_obj, google_translate, ollama_translate
class GlmRecognizer:
    """
    Processes audio data with the GLM-ASR engine and prints JSON strings to
    stdout in the format the Auto Caption app can read.

    Init parameters:
        url: GLM-ASR API URL
        model: GLM-ASR model name
        api_key: GLM-ASR API key
        source: source language
        target: target language
        trans_model: translation model name
        ollama_name: Ollama model name
    """
    def __init__(self, url: str, model: str, api_key: str, source: str, target: str | None, trans_model: str, ollama_name: str, ollama_url: str = '', ollama_api_key: str = ''):
        self.url = url
        self.model = model
        self.api_key = api_key
        self.source = source
        self.target = target
        if trans_model == 'google':
            self.trans_func = google_translate
        else:
            self.trans_func = ollama_translate
        self.ollama_name = ollama_name
        self.ollama_url = ollama_url
        self.ollama_api_key = ollama_api_key
        self.audio_buffer = []
        self.is_speech = False
        self.silence_frames = 0
        self.speech_start_time = None
        self.time_str = ''
        self.cur_id = 0
        # VAD settings (assumes 16 kHz, 16-bit audio; chunk size 1024 or similar)
        # 16-bit = 2 bytes per sample.
        # The RMS threshold needs tuning; 500 is a conservative guess for silence.
        self.threshold = 500
        self.silence_limit = 15  # frames (approx. 0.5-1 s depending on chunk size)
        self.min_speech_frames = 10  # frames
    def start(self):
        """Start the GLM engine."""
        stdout_cmd('info', 'GLM-ASR recognizer started.')

    def stop(self):
        """Stop the GLM engine."""
        stdout_cmd('info', 'GLM-ASR recognizer stopped.')
    def process_audio(self, chunk):
        # chunk is raw bytes of int16 samples
        rms = audioop.rms(chunk, 2)
        if rms > self.threshold:
            if not self.is_speech:
                self.is_speech = True
                self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
                self.audio_buffer = []
            self.audio_buffer.append(chunk)
            self.silence_frames = 0
        else:
            if self.is_speech:
                self.audio_buffer.append(chunk)
                self.silence_frames += 1
                if self.silence_frames > self.silence_limit:
                    # Speech ended
                    if len(self.audio_buffer) > self.min_speech_frames:
                        self.recognize(self.audio_buffer, self.time_str)
                    self.is_speech = False
                    self.audio_buffer = []
                    self.silence_frames = 0
    def recognize(self, audio_frames, time_s):
        audio_bytes = b''.join(audio_frames)
        wav_io = io.BytesIO()
        with wave.open(wav_io, 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(16000)
            wav_file.writeframes(audio_bytes)
        wav_io.seek(0)
        threading.Thread(
            target=self._do_request,
            args=(wav_io.read(), time_s, self.cur_id)
        ).start()
        self.cur_id += 1
    def _do_request(self, audio_content, time_s, index):
        try:
            files = {
                'file': ('audio.wav', audio_content, 'audio/wav')
            }
            data = {
                'model': self.model,
                'stream': 'false'
            }
            headers = {
                'Authorization': f'Bearer {self.api_key}'
            }
            response = requests.post(self.url, headers=headers, data=data, files=files, timeout=15)
            if response.status_code == 200:
                res_json = response.json()
                text = res_json.get('text', '')
                if text:
                    self.output_caption(text, time_s, index)
            else:
                try:
                    err_msg = response.json()
                    stdout_cmd('error', f"GLM API Error: {err_msg}")
                except ValueError:  # error body was not JSON
                    stdout_cmd('error', f"GLM API Error: {response.text}")
        except Exception as e:
            stdout_cmd('error', f"GLM Request Failed: {str(e)}")
    def output_caption(self, text, time_s, index):
        caption = {
            'command': 'caption',
            'index': index,
            'time_s': time_s,
            'time_t': datetime.now().strftime('%H:%M:%S.%f')[:-3],
            'text': text,
            'translation': ''
        }
        if self.target:
            if self.trans_func == ollama_translate:
                th = threading.Thread(
                    target=self.trans_func,
                    args=(self.ollama_name, self.target, caption['text'], time_s, self.ollama_url, self.ollama_api_key),
                    daemon=True
                )
            else:
                th = threading.Thread(
                    target=self.trans_func,
                    args=(self.ollama_name, self.target, caption['text'], time_s),
                    daemon=True
                )
            th.start()
        stdout_obj(caption)
    def translate(self):
        while shared_data.status == 'running':
            chunk = shared_data.chunk_queue.get()
            self.process_audio(chunk)
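
For orientation, a minimal sketch (not part of the commit) of how these VAD settings map to wall-clock time and RMS levels, assuming main.py's default chunk_rate of 10, i.e. one ~100 ms chunk per process_audio call:

# Standalone tuning aid; the chunk size and tone are illustrative assumptions.
import audioop, math, struct

CHUNK_RATE = 10                  # main.py default: chunks per second
chunk_ms = 1000 / CHUNK_RATE     # each chunk covers ~100 ms of audio
print(f"segment closes after ~{15 * chunk_ms / 1000:.1f} s of silence (silence_limit=15)")

# RMS of a 440 Hz tone vs. digital silence at 16 kHz / 16-bit mono (one chunk):
rate, n = 16000, 1600
tone = struct.pack(f'<{n}h', *(int(8000 * math.sin(2 * math.pi * 440 * i / rate)) for i in range(n)))
print(audioop.rms(tone, 2), audioop.rms(b'\x00\x00' * n, 2))  # ~5657 vs 0; threshold=500 sits in between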

View File

@@ -29,7 +29,7 @@ class SosvRecognizer:
        trans_model: translation model name
        ollama_name: Ollama model name
    """
    def __init__(self, model_path: str, source: str, target: str | None, trans_model: str, ollama_name: str):
    def __init__(self, model_path: str, source: str, target: str | None, trans_model: str, ollama_name: str, ollama_url: str = '', ollama_api_key: str = ''):
        if model_path.startswith('"'):
            model_path = model_path[1:]
        if model_path.endswith('"'):
@@ -45,6 +45,8 @@ class SosvRecognizer:
        else:
            self.trans_func = ollama_translate
        self.ollama_name = ollama_name
        self.ollama_url = ollama_url
        self.ollama_api_key = ollama_api_key
        self.time_str = ''
        self.cur_id = 0
        self.prev_content = ''
@@ -152,7 +154,7 @@ class SosvRecognizer:
        if self.target:
            th = threading.Thread(
                target=self.trans_func,
                args=(self.ollama_name, self.target, caption['text'], self.time_str),
                args=(self.ollama_name, self.target, caption['text'], self.time_str, self.ollama_url, self.ollama_api_key),
                daemon=True
            )
            th.start()

View File

@@ -18,7 +18,7 @@ class VoskRecognizer:
        trans_model: translation model name
        ollama_name: Ollama model name
    """
    def __init__(self, model_path: str, target: str | None, trans_model: str, ollama_name: str):
    def __init__(self, model_path: str, target: str | None, trans_model: str, ollama_name: str, ollama_url: str = '', ollama_api_key: str = ''):
        SetLogLevel(-1)
        if model_path.startswith('"'):
            model_path = model_path[1:]
@@ -31,6 +31,8 @@ class VoskRecognizer:
        else:
            self.trans_func = ollama_translate
        self.ollama_name = ollama_name
        self.ollama_url = ollama_url
        self.ollama_api_key = ollama_api_key
        self.time_str = ''
        self.cur_id = 0
        self.prev_content = ''
@@ -66,7 +68,7 @@ class VoskRecognizer:
        if self.target:
            th = threading.Thread(
                target=self.trans_func,
                args=(self.ollama_name, self.target, caption['text'], self.time_str),
                args=(self.ollama_name, self.target, caption['text'], self.time_str, self.ollama_url, self.ollama_api_key),
                daemon=True
            )
            th.start()

View File

@@ -8,6 +8,7 @@ from utils import merge_chunk_channels, resample_chunk_mono
from audio2text import GummyRecognizer
from audio2text import VoskRecognizer
from audio2text import SosvRecognizer
from audio2text import GlmRecognizer
from sysaudio import AudioStream
@@ -74,7 +75,7 @@ def main_gummy(s: str, t: str, a: int, c: int, k: str, r: bool, rp: str):
    engine.stop()

def main_vosk(a: int, c: int, vosk: str, t: str, tm: str, omn: str, r: bool, rp: str):
def main_vosk(a: int, c: int, vosk: str, t: str, tm: str, omn: str, ourl: str, okey: str, r: bool, rp: str):
    """
    Parameters:
        a: Audio source: 0 for output, 1 for input
@@ -83,14 +84,16 @@ def main_vosk(a: int, c: int, vosk: str, t: str, tm: str, omn: str, r: bool, rp:
        t: Target language
        tm: Translation model type, ollama or google
        omn: Ollama model name
        ourl: Ollama base URL
        okey: Ollama API key
        r: Whether to record the audio
        rp: Path to save the recorded audio
    """
    stream = AudioStream(a, c)
    if t == 'none':
        engine = VoskRecognizer(vosk, None, tm, omn)
        engine = VoskRecognizer(vosk, None, tm, omn, ourl, okey)
    else:
        engine = VoskRecognizer(vosk, t, tm, omn)
        engine = VoskRecognizer(vosk, t, tm, omn, ourl, okey)
    engine.start()
    stream_thread = threading.Thread(
@@ -106,7 +109,7 @@ def main_vosk(a: int, c: int, vosk: str, t: str, tm: str, omn: str, r: bool, rp:
    engine.stop()

def main_sosv(a: int, c: int, sosv: str, s: str, t: str, tm: str, omn: str, r: bool, rp: str):
def main_sosv(a: int, c: int, sosv: str, s: str, t: str, tm: str, omn: str, ourl: str, okey: str, r: bool, rp: str):
    """
    Parameters:
        a: Audio source: 0 for output, 1 for input
@@ -116,14 +119,16 @@ def main_sosv(a: int, c: int, sosv: str, s: str, t: str, tm: str, omn: str, r: b
        t: Target language
        tm: Translation model type, ollama or google
        omn: Ollama model name
        ourl: Ollama API URL
        okey: Ollama API key
        r: Whether to record the audio
        rp: Path to save the recorded audio
    """
    stream = AudioStream(a, c)
    if t == 'none':
        engine = SosvRecognizer(sosv, s, None, tm, omn)
        engine = SosvRecognizer(sosv, s, None, tm, omn, ourl, okey)
    else:
        engine = SosvRecognizer(sosv, s, t, tm, omn)
        engine = SosvRecognizer(sosv, s, t, tm, omn, ourl, okey)
    engine.start()
    stream_thread = threading.Thread(
@@ -139,16 +144,54 @@ def main_sosv(a: int, c: int, sosv: str, s: str, t: str, tm: str, omn: str, r: b
    engine.stop()

def main_glm(a: int, c: int, url: str, model: str, key: str, s: str, t: str, tm: str, omn: str, ourl: str, okey: str, r: bool, rp: str):
    """
    Parameters:
        a: Audio source
        c: Chunk rate
        url: GLM API URL
        model: GLM model name
        key: GLM API key
        s: Source language
        t: Target language
        tm: Translation model
        omn: Ollama model name
        ourl: Ollama API URL
        okey: Ollama API key
        r: Record
        rp: Record path
    """
    stream = AudioStream(a, c)
    if t == 'none':
        engine = GlmRecognizer(url, model, key, s, None, tm, omn, ourl, okey)
    else:
        engine = GlmRecognizer(url, model, key, s, t, tm, omn, ourl, okey)
    engine.start()
    stream_thread = threading.Thread(
        target=audio_recording,
        args=(stream, True, r, rp),
        daemon=True
    )
    stream_thread.start()
    try:
        engine.translate()
    except KeyboardInterrupt:
        stdout("Keyboard interrupt detected. Exiting...")
    engine.stop()
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert system audio stream to text')
    # all
    parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy, vosk, sosv or glm')
    parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
    parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second')
    parser.add_argument('-p', '--port', default=0, help='The port to run the server on, 0 for no server')
    parser.add_argument('-d', '--display_caption', default=0, help='Display caption on terminal, 0 for no display, 1 for display')
    parser.add_argument('-a', '--audio_type', type=int, default=0, help='Audio stream source: 0 for output, 1 for input')
    parser.add_argument('-c', '--chunk_rate', type=int, default=10, help='Number of audio stream chunks collected per second')
    parser.add_argument('-p', '--port', type=int, default=0, help='The port to run the server on, 0 for no server')
    parser.add_argument('-d', '--display_caption', type=int, default=0, help='Display caption on terminal, 0 for no display, 1 for display')
    parser.add_argument('-t', '--target_language', default='none', help='Target language code, "none" for no translation')
    parser.add_argument('-r', '--record', default=0, help='Whether to record the audio, 0 for no recording, 1 for recording')
    parser.add_argument('-r', '--record', type=int, default=0, help='Whether to record the audio, 0 for no recording, 1 for recording')
    parser.add_argument('-rp', '--record_path', default='', help='Path to save the recorded audio')
    # gummy and sosv
    parser.add_argument('-s', '--source_language', default='auto', help='Source language code')
@@ -157,20 +200,24 @@ if __name__ == "__main__":
    # vosk and sosv
    parser.add_argument('-tm', '--translation_model', default='ollama', help='Model for translation: ollama or google')
    parser.add_argument('-omn', '--ollama_name', default='', help='Ollama model name for translation')
    parser.add_argument('-ourl', '--ollama_url', default='', help='Ollama API URL')
    parser.add_argument('-okey', '--ollama_api_key', default='', help='Ollama API Key')
    # vosk only
    parser.add_argument('-vosk', '--vosk_model', default='', help='The path to the vosk model.')
    # sosv only
    parser.add_argument('-sosv', '--sosv_model', default=None, help='The SenseVoice model path')
    # glm only
    parser.add_argument('-gurl', '--glm_url', default='https://open.bigmodel.cn/api/paas/v4/audio/transcriptions', help='GLM API URL')
    parser.add_argument('-gmodel', '--glm_model', default='glm-asr-2512', help='GLM Model Name')
    parser.add_argument('-gkey', '--glm_api_key', default='', help='GLM API Key')
    args = parser.parse_args()
    if int(args.port) == 0:
        shared_data.status = "running"
    else:
        start_server(int(args.port))
    if int(args.display_caption) != 0:
    if args.port != 0:
        threading.Thread(target=start_server, args=(args.port,), daemon=True).start()
    if args.display_caption == 1:
        change_caption_display(True)
        print("Caption will be displayed on terminal")
    if args.caption_engine == 'gummy':
        main_gummy(
@@ -179,7 +226,7 @@ if __name__ == "__main__":
            int(args.audio_type),
            int(args.chunk_rate),
            args.api_key,
            True if int(args.record) == 1 else False,
            bool(int(args.record)),
            args.record_path
        )
    elif args.caption_engine == 'vosk':
@@ -190,7 +237,9 @@ if __name__ == "__main__":
            args.target_language,
            args.translation_model,
            args.ollama_name,
            True if int(args.record) == 1 else False,
            args.ollama_url,
            args.ollama_api_key,
            bool(int(args.record)),
            args.record_path
        )
    elif args.caption_engine == 'sosv':
@@ -202,7 +251,25 @@ if __name__ == "__main__":
            args.target_language,
            args.translation_model,
            args.ollama_name,
            True if int(args.record) == 1 else False,
            args.ollama_url,
            args.ollama_api_key,
            bool(int(args.record)),
            args.record_path
        )
    elif args.caption_engine == 'glm':
        main_glm(
            int(args.audio_type),
            int(args.chunk_rate),
            args.glm_url,
            args.glm_model,
            args.glm_api_key,
            args.source_language,
            args.target_language,
            args.translation_model,
            args.ollama_name,
            args.ollama_url,
            args.ollama_api_key,
            bool(int(args.record)),
            args.record_path
        )
    else:

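As a usage sketch (not part of the diff), with the flags registered above, the new engine could be launched like this; the key and model values are placeholders:

python main.py -e glm -gkey <your-glm-api-key> -t en -tm google
python main.py -e glm -gkey <your-glm-api-key> -t en -tm ollama -omn <model> -ourl <openai-compatible-base-url> -okey <llm-api-key>

The second form exercises the new Ollama options: when -ourl is set and the openai package is importable, translation goes through the OpenAI-compatible client; otherwise an ollama Client is pointed at that host.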
View File

@@ -6,11 +6,17 @@ import sys
if sys.platform == 'win32':
    vosk_path = str(Path('./.venv/Lib/site-packages/vosk').resolve())
else:
    vosk_path = str(Path('./.venv/lib/python3.12/site-packages/vosk').resolve())
    venv_lib = Path('./.venv/lib')
    python_dirs = list(venv_lib.glob('python*'))
    if python_dirs:
        vosk_path = str((python_dirs[0] / 'site-packages' / 'vosk').resolve())
    else:
        vosk_path = str(Path('./.venv/lib/python3.12/site-packages/vosk').resolve())

a = Analysis(
    ['main.py'],
    pathex=[],
    # binaries=[('portaudio/lib/.libs/libportaudio.2.dylib', '.')],
    binaries=[],
    datas=[(vosk_path, 'vosk')],
    hiddenimports=[],
@@ -27,21 +33,27 @@ pyz = PYZ(a.pure)
exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.datas,
    [],
    exclude_binaries=True,
    name='main',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
    onefile=True,
)
coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name='main',
)
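
A note on the effect (mine, not the commit's): with exclude_binaries=True on EXE plus the added COLLECT step, PyInstaller produces a onedir bundle (dist/main/) instead of a single-file executable, and the .venv/lib/python* glob removes the hard-coded Python 3.12 path on non-Windows. The build command stays the same:

pyinstaller main.spec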

View File

@@ -7,4 +7,6 @@ pyaudio; sys_platform == 'darwin'
pyaudiowpatch; sys_platform == 'win32'
googletrans
ollama
sherpa_onnx
requests
openai
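The two new entries back the changes above: requests for the GLM-ASR HTTP calls and openai for the OpenAI-compatible translation path (optional at runtime, guarded by a try/except import). Typical install:

pip install -r requirements.txt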

View File

@@ -47,7 +47,6 @@ def translation_display(obj):
def stdout_obj(obj):
    global display_caption
    print(obj['command'], display_caption)
    if obj['command'] == 'caption' and display_caption:
        caption_display(obj)
    return

View File

@@ -1,5 +1,9 @@
from ollama import chat
from ollama import chat, Client
from ollama import ChatResponse
try:
    from openai import OpenAI
except ImportError:
    OpenAI = None
import asyncio
from googletrans import Translator
from .sysout import stdout_cmd, stdout_obj
@@ -17,15 +21,43 @@ lang_map = {
    'zh-cn': 'Chinese'
}
def ollama_translate(model: str, target: str, text: str, time_s: str):
    response: ChatResponse = chat(
        model=model,
        messages=[
            {"role": "system", "content": f"/no_think Translate the following content into {lang_map[target]}, and do not output any additional information."},
            {"role": "user", "content": text}
        ]
    )
    content = response.message.content or ""
def ollama_translate(model: str, target: str, text: str, time_s: str, url: str = '', key: str = ''):
    content = ""
    try:
        if url:
            if OpenAI:
                client = OpenAI(base_url=url, api_key=key if key else "ollama")
                response = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": f"/no_think Translate the following content into {lang_map[target]}, and do not output any additional information."},
                        {"role": "user", "content": text}
                    ]
                )
                content = response.choices[0].message.content or ""
            else:
                client = Client(host=url)
                response: ChatResponse = client.chat(
                    model=model,
                    messages=[
                        {"role": "system", "content": f"/no_think Translate the following content into {lang_map[target]}, and do not output any additional information."},
                        {"role": "user", "content": text}
                    ]
                )
                content = response.message.content or ""
        else:
            response: ChatResponse = chat(
                model=model,
                messages=[
                    {"role": "system", "content": f"/no_think Translate the following content into {lang_map[target]}, and do not output any additional information."},
                    {"role": "user", "content": text}
                ]
            )
            content = response.message.content or ""
    except Exception as e:
        stdout_cmd("warn", f"Translation failed: {str(e)}")
        return
    if content.startswith('<think>'):
        index = content.find('</think>')
        if index != -1: