From 50ea9c5e4cc6a075cf7cda35220e1c515d6b6ff8 Mon Sep 17 00:00:00 2001
From: himeditator <hironin@foxmail.com>
Date: Sat, 5 Jul 2025 12:45:43 +0800
Subject: [PATCH] =?UTF-8?q?refactor(caption):=20=E9=87=8D=E6=9E=84?=
 =?UTF-8?q?=E5=AD=97=E5=B9=95=E5=BC=95=E6=93=8E=E7=BB=93=E6=9E=84=E3=80=81?=
 =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=AD=97=E5=B9=95=E5=BC=95=E6=93=8E=E7=A9=BA?=
 =?UTF-8?q?=E7=BD=AE=E6=8A=A5=E9=94=99=20(#2)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 修复 gummy 字幕引擎长时间空置报错的问题
- 将 python-subprocess 文件夹重命名为 caption-engine
- 删除未使用的 prototype 代码
---
 .gitignore                                    |   2 +-
 README.md                                     |   4 +-
 README_en.md                                  |   4 +-
 assets/engine-manual_zh.md                    |   4 +-
 .../audio2text/gummy.py                       |  10 +-
 .../main-gummy.py                             |   6 +-
 .../main-gummy.spec                           |   0
 caption-engine/requirements.txt               |   5 +
 .../sysaudio/linux.py                         |   0
 .../sysaudio/win.py                           |   8 +-
 electron-builder.yml                          |   4 +-
 python-prototype/gummy.ipynb                  | 221 ------------------
 python-prototype/requirements.txt             |   4 -
 python-subprocess/requirements.txt            | Bin 214 -> 0 bytes
 src/main/utils/CaptionEngine.ts               |   4 +-
 15 files changed, 31 insertions(+), 245 deletions(-)
 rename {python-subprocess => caption-engine}/audio2text/gummy.py (96%)
 rename {python-subprocess => caption-engine}/main-gummy.py (89%)
 rename {python-subprocess => caption-engine}/main-gummy.spec (100%)
 create mode 100644 caption-engine/requirements.txt
 rename {python-subprocess => caption-engine}/sysaudio/linux.py (100%)
 rename {python-subprocess => caption-engine}/sysaudio/win.py (98%)
 delete mode 100644 python-prototype/gummy.ipynb
 delete mode 100644 python-prototype/requirements.txt
 delete mode 100644 python-subprocess/requirements.txt

diff --git a/.gitignore b/.gitignore
index f385eb0..60312fd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,4 @@ out
 *.log*
 __pycache__
 subenv
-python-subprocess/build
\ No newline at end of file
+caption-engine/build
diff --git a/README.md b/README.md
index 1994c57..d3b4b58 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ npm install
 >
 > 本项目的 gummy 字幕引擎是一个 python 子程序，通过 pyinstaller 打包为可执行文件。 运行字幕引擎子程序的代码在 `src\main\utils\engine.ts` 文件中。
 
-首先进入 `python-subprocess` 文件夹，执行如下指令创建虚拟环境：
+首先进入 `caption-engine` 文件夹，执行如下指令创建虚拟环境：
 
 ```bash
 python -m venv subenv
@@ -86,7 +86,7 @@ pip install -r requirements.txt
 pyinstaller --onefile main-gummy.py
 ```
 
-此时项目构建完成，在进入 `python-subprocess/dist` 文件夹可见对应的可执行文件。即可进行后续操作。
+此时项目构建完成，进入 `caption-engine/dist` 文件夹即可看到对应的可执行文件，随后即可进行后续操作。
 
 ### 运行项目
 
diff --git a/README_en.md b/README_en.md
index 3edc398..eeaadec 100644
--- a/README_en.md
+++ b/README_en.md
@@ -65,7 +65,7 @@ npm install
 >
 > The gummy subtitle engine in this project is a Python subprocess, packaged into an executable file using pyinstaller. The code for running the subtitle engine subprocess is in the `src\main\utils\engine.ts` file.
 
-First, enter the `python-subprocess` folder and execute the following command to create a virtual environment:
+First, enter the `caption-engine` folder and execute the following command to create a virtual environment:
 
 ```bash
 python -m venv subenv
@@ -92,7 +92,7 @@ Then build the project using `pyinstaller`:
 pyinstaller --onefile main-gummy.py
 ```
 
-At this point, the project is built. You can find the corresponding executable file in the `python-subprocess/dist` folder. You can proceed with further operations.
+At this point, the project is built. You can find the resulting executable in the `caption-engine/dist` folder and proceed with the steps below.
 
 ### Run the Project
 
diff --git a/assets/engine-manual_zh.md b/assets/engine-manual_zh.md
index b19aeb8..6df6a22 100644
--- a/assets/engine-manual_zh.md
+++ b/assets/engine-manual_zh.md
@@ -39,7 +39,7 @@ export interface CaptionItem {
 如果使用 python 语言，可以参考以下方式将数据传递给主程序：
 
 ```python
-# python-subprocess\audio2text\gummy.py
+# caption-engine\audio2text\gummy.py
 ...
     def send_to_node(self, data):
         """
@@ -84,4 +84,4 @@ export interface CaptionItem {
 
 ## 参考代码
 
-本项目 `python-subprocess` 文件夹下的 `main-gummy.py` 文件为默认字幕引擎的入口代码。`src\main\utils\engine.ts` 为服务端获取字幕引擎数据和进行处理的代码。可以根据需要阅读了解字幕引擎的实现细节和完整运行过程。
\ No newline at end of file
+本项目 `caption-engine` 文件夹下的 `main-gummy.py` 文件为默认字幕引擎的入口代码。`src\main\utils\CaptionEngine.ts` 为服务端获取字幕引擎数据和进行处理的代码。可以根据需要阅读了解字幕引擎的实现细节和完整运行过程。
diff --git a/python-subprocess/audio2text/gummy.py b/caption-engine/audio2text/gummy.py
similarity index 96%
rename from python-subprocess/audio2text/gummy.py
rename to caption-engine/audio2text/gummy.py
index 9f420f8..1d7de96 100644
--- a/python-subprocess/audio2text/gummy.py
+++ b/caption-engine/audio2text/gummy.py
@@ -2,7 +2,7 @@ from dashscope.audio.asr import (
     TranslationRecognizerCallback,
     TranscriptionResult,
     TranslationResult,
-    TranslationRecognizerRealtime    
+    TranslationRecognizerRealtime
 )
 from datetime import datetime
 import json
@@ -17,11 +17,13 @@ class Callback(TranslationRecognizerCallback):
         self.usage = 0
         self.cur_id = -1
         self.time_str = ''
-    
+
     def on_open(self) -> None:
+        # print("on_open")
         pass
 
     def on_close(self) -> None:
+        # print("on_close")
         pass
 
     def on_event(
@@ -44,11 +46,11 @@ class Callback(TranslationRecognizerCallback):
                 caption['time_s'] = self.time_str
             caption['time_t'] = datetime.now().strftime('%H:%M:%S')
             caption['translation'] = ""
-        
+
         if translation_result is not None:
             lang = translation_result.get_language_list()[0]
             caption['translation'] = translation_result.get_translation(lang).text
-        
+
         if usage:
             self.usage += usage['duration']
 
diff --git a/python-subprocess/main-gummy.py b/caption-engine/main-gummy.py
similarity index 89%
rename from python-subprocess/main-gummy.py
rename to caption-engine/main-gummy.py
index 5b3903e..df648e3 100644
--- a/python-subprocess/main-gummy.py
+++ b/caption-engine/main-gummy.py
@@ -27,7 +27,11 @@ def convert_audio_to_text(s_lang, t_lang, audio_type):
             if not stream.stream: continue
             data = stream.stream.read(stream.CHUNK)
             data = mergeStreamChannels(data, stream.CHANNELS)
-            gummy.translator.send_audio_frame(data)
+            try:  # sending can fail after the engine has sat idle for a long time
+                gummy.translator.send_audio_frame(data)
+            except Exception:  # not a bare except: KeyboardInterrupt must reach the outer handler
+                gummy.translator.start()  # restart the recognizer, then resend the same frame
+                gummy.translator.send_audio_frame(data)
         except KeyboardInterrupt:
             stream.closeStream()
             gummy.translator.stop()
diff --git a/python-subprocess/main-gummy.spec b/caption-engine/main-gummy.spec
similarity index 100%
rename from python-subprocess/main-gummy.spec
rename to caption-engine/main-gummy.spec
diff --git a/caption-engine/requirements.txt b/caption-engine/requirements.txt
new file mode 100644
index 0000000..eefe30b
--- /dev/null
+++ b/caption-engine/requirements.txt
@@ -0,0 +1,5 @@
+dashscope==1.23.5
+numpy==2.2.6
+PyAudio==0.2.14
+PyAudioWPatch==0.2.12.7; sys_platform == "win32"  # Windows only
+pyinstaller==6.14.1
diff --git a/python-subprocess/sysaudio/linux.py b/caption-engine/sysaudio/linux.py
similarity index 100%
rename from python-subprocess/sysaudio/linux.py
rename to caption-engine/sysaudio/linux.py
diff --git a/python-subprocess/sysaudio/win.py b/caption-engine/sysaudio/win.py
similarity index 98%
rename from python-subprocess/sysaudio/win.py
rename to caption-engine/sysaudio/win.py
index f7c7af6..bb7b121 100644
--- a/python-subprocess/sysaudio/win.py
+++ b/caption-engine/sysaudio/win.py
@@ -35,7 +35,7 @@ def getDefaultLoopbackDevice(mic: pyaudio.PyAudio, info = True)->dict:
             print("Run `python -m pyaudiowpatch` to check available devices.")
             print("Exiting...")
             exit()
-            
+
     if(info): print(f"Output Stream Device: #{default_speaker['index']} {default_speaker['name']}")
     return default_speaker
 
@@ -64,7 +64,7 @@ def mergeStreamChannels(data, channels):
 class AudioStream:
     """
     获取系统音频流
-    
+
     参数：
         audio_type: （默认）0-系统音频输出流，1-系统音频输入流
     """
@@ -116,7 +116,7 @@ class AudioStream:
             input_device_index = self.INDEX
         )
         return self.stream
-    
+
     def closeStream(self):
         """
         关闭系统音频输出流
@@ -124,4 +124,4 @@ class AudioStream:
         if self.stream is None: return
         self.stream.stop_stream()
         self.stream.close()
-        self.stream = None
\ No newline at end of file
+        self.stream = None
diff --git a/electron-builder.yml b/electron-builder.yml
index ef543cc..fc3a4cc 100644
--- a/electron-builder.yml
+++ b/electron-builder.yml
@@ -10,8 +10,8 @@ files:
   - '!{.env,.env.*,.npmrc,pnpm-lock.yaml}'
   - '!{tsconfig.json,tsconfig.node.json,tsconfig.web.json}'
 extraResources:
-  from: ./python-subprocess/dist/main-gummy.exe
-  to: ./python-subprocess/dist/main-gummy.exe
+  from: ./caption-engine/dist/main-gummy.exe
+  to: ./caption-engine/dist/main-gummy.exe
 asarUnpack:
   - resources/**
 win:
diff --git a/python-prototype/gummy.ipynb b/python-prototype/gummy.ipynb
deleted file mode 100644
index a105f29..0000000
--- a/python-prototype/gummy.ipynb
+++ /dev/null
@@ -1,221 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from dashscope.audio.asr import *\n",
-    "import pyaudiowpatch as pyaudio\n",
-    "import numpy as np\n",
-    "\n",
-    "\n",
-    "def getDefaultSpeakers(mic: pyaudio.PyAudio, info = True):\n",
-    "    \"\"\"\n",
-    "    获取默认的系统音频输出的回环设备\n",
-    "    Args:\n",
-    "        mic (pyaudio.PyAudio): pyaudio对象\n",
-    "        info (bool, optional): 是否打印设备信息. Defaults to True.\n",
-    "\n",
-    "    Returns:\n",
-    "        dict: 统音频输出的回环设备\n",
-    "    \"\"\"\n",
-    "    try:\n",
-    "        WASAPI_info = mic.get_host_api_info_by_type(pyaudio.paWASAPI)\n",
-    "    except OSError:\n",
-    "        print(\"Looks like WASAPI is not available on the system. Exiting...\")\n",
-    "        exit()\n",
-    "\n",
-    "    default_speaker = mic.get_device_info_by_index(WASAPI_info[\"defaultOutputDevice\"])\n",
-    "    if(info): print(\"wasapi_info:\\n\", WASAPI_info, \"\\n\")\n",
-    "    if(info): print(\"default_speaker:\\n\", default_speaker, \"\\n\")\n",
-    "\n",
-    "    if not default_speaker[\"isLoopbackDevice\"]:\n",
-    "        for loopback in mic.get_loopback_device_info_generator():\n",
-    "            if default_speaker[\"name\"] in loopback[\"name\"]:\n",
-    "                default_speaker = loopback\n",
-    "                if(info): print(\"Using loopback device:\\n\", default_speaker, \"\\n\")\n",
-    "                break\n",
-    "        else:\n",
-    "            print(\"Default loopback output device not found.\")\n",
-    "            print(\"Run `python -m pyaudiowpatch` to check available devices.\")\n",
-    "            print(\"Exiting...\")\n",
-    "            exit()\n",
-    "            \n",
-    "    if(info): print(f\"Recording Device: #{default_speaker['index']} {default_speaker['name']}\")\n",
-    "    return default_speaker\n",
-    "\n",
-    "\n",
-    "class Callback(TranslationRecognizerCallback):\n",
-    "    \"\"\"\n",
-    "    语音大模型流式传输回调对象\n",
-    "    \"\"\"\n",
-    "    def __init__(self):\n",
-    "        super().__init__()\n",
-    "        self.usage = 0\n",
-    "        self.sentences = []\n",
-    "        self.translations = []\n",
-    "    \n",
-    "    def on_open(self) -> None:\n",
-    "        print(\"\\n流式翻译开始...\\n\")\n",
-    "\n",
-    "    def on_close(self) -> None:\n",
-    "        print(f\"\\nTokens消耗：{self.usage}\")\n",
-    "        print(f\"流式翻译结束...\\n\")\n",
-    "        for i in range(len(self.sentences)):\n",
-    "            print(f\"\\n{self.sentences[i]}\\n{self.translations[i]}\\n\")\n",
-    "\n",
-    "    def on_event(\n",
-    "        self,\n",
-    "        request_id,\n",
-    "        transcription_result: TranscriptionResult,\n",
-    "        translation_result: TranslationResult,\n",
-    "        usage\n",
-    "    ) -> None:\n",
-    "        if transcription_result is not None:\n",
-    "            id = transcription_result.sentence_id\n",
-    "            text = transcription_result.text\n",
-    "            if transcription_result.stash is not None:\n",
-    "                stash = transcription_result.stash.text\n",
-    "            else:\n",
-    "                stash = \"\"\n",
-    "            print(f\"#{id}: {text}{stash}\")\n",
-    "            if usage: self.sentences.append(text)\n",
-    "        \n",
-    "        if translation_result is not None:\n",
-    "            lang = translation_result.get_language_list()[0]\n",
-    "            text = translation_result.get_translation(lang).text\n",
-    "            if translation_result.get_translation(lang).stash is not None:\n",
-    "                stash = translation_result.get_translation(lang).stash.text\n",
-    "            else:\n",
-    "                stash = \"\"\n",
-    "            print(f\"#{lang}: {text}{stash}\")\n",
-    "            if usage: self.translations.append(text)\n",
-    "        \n",
-    "        if usage: self.usage += usage['duration']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "采样输入设备：\n",
-      "    - 序号：37\n",
-      "    - 名称：耳机 (HUAWEI FreeLace 活力版) [Loopback]\n",
-      "    - 最大输入通道数：2\n",
-      "    - 默认低输入延迟：0.003s\n",
-      "    - 默认高输入延迟：0.01s\n",
-      "    - 默认采样率：44100.0Hz\n",
-      "    - 是否回环设备：True\n",
-      "\n",
-      "音频样本块大小：4410\n",
-      "样本位宽：2\n",
-      "音频数据格式：8\n",
-      "音频通道数：2\n",
-      "音频采样率：44100\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "mic = pyaudio.PyAudio()\n",
-    "default_speaker = getDefaultSpeakers(mic, False)\n",
-    "\n",
-    "SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)\n",
-    "FORMAT = pyaudio.paInt16\n",
-    "CHANNELS = default_speaker[\"maxInputChannels\"]\n",
-    "RATE = int(default_speaker[\"defaultSampleRate\"])\n",
-    "CHUNK = RATE // 10\n",
-    "INDEX = default_speaker[\"index\"]\n",
-    "\n",
-    "dev_info = f\"\"\"\n",
-    "采样输入设备：\n",
-    "    - 序号：{default_speaker['index']}\n",
-    "    - 名称：{default_speaker['name']}\n",
-    "    - 最大输入通道数：{default_speaker['maxInputChannels']}\n",
-    "    - 默认低输入延迟：{default_speaker['defaultLowInputLatency']}s\n",
-    "    - 默认高输入延迟：{default_speaker['defaultHighInputLatency']}s\n",
-    "    - 默认采样率：{default_speaker['defaultSampleRate']}Hz\n",
-    "    - 是否回环设备：{default_speaker['isLoopbackDevice']}\n",
-    "\n",
-    "音频样本块大小：{CHUNK}\n",
-    "样本位宽：{SAMP_WIDTH}\n",
-    "音频数据格式：{FORMAT}\n",
-    "音频通道数：{CHANNELS}\n",
-    "音频采样率：{RATE}\n",
-    "\"\"\"\n",
-    "print(dev_info)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "RECORD_SECONDS = 20 # 监听时长(s)\n",
-    "\n",
-    "stream = mic.open(\n",
-    "    format = FORMAT,\n",
-    "    channels = CHANNELS,\n",
-    "    rate = RATE,\n",
-    "    input = True,\n",
-    "    input_device_index = INDEX\n",
-    ")\n",
-    "translator = TranslationRecognizerRealtime(\n",
-    "    model = \"gummy-realtime-v1\",\n",
-    "    format = \"pcm\",\n",
-    "    sample_rate = RATE,\n",
-    "    transcription_enabled = True,\n",
-    "    translation_enabled = True,\n",
-    "    source_language = \"ja\",\n",
-    "    translation_target_languages = [\"zh\"],\n",
-    "    callback = Callback()\n",
-    ")\n",
-    "translator.start()\n",
-    "\n",
-    "for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):\n",
-    "    data = stream.read(CHUNK)\n",
-    "    data_np = np.frombuffer(data, dtype=np.int16)\n",
-    "    data_np_r = data_np.reshape(-1, CHANNELS)\n",
-    "    print(data_np_r.shape)\n",
-    "    mono_data = np.mean(data_np_r.astype(np.float32), axis=1)\n",
-    "    mono_data = mono_data.astype(np.int16)\n",
-    "    mono_data_bytes = mono_data.tobytes()\n",
-    "    translator.send_audio_frame(mono_data_bytes)\n",
-    "\n",
-    "translator.stop()\n",
-    "stream.stop_stream()\n",
-    "stream.close()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "mystd",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.12"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python-prototype/requirements.txt b/python-prototype/requirements.txt
deleted file mode 100644
index 12dede0..0000000
--- a/python-prototype/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-numpy
-dashscope
-pyaudio
-pyaudiowpatch
\ No newline at end of file
diff --git a/python-subprocess/requirements.txt b/python-subprocess/requirements.txt
deleted file mode 100644
index d888ae587250a5bc2a677efdfe931c236466b60e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 214
zcmZXOF$%&!5Jg{|Qv~dTi5e>lZ(x@?7$HL3h%tzlSKlsH$}+PsZ~mYEes#1=^wyD)
zD<^iol7fnreM3fI<|2D0RwY(YOAgNV(vGw0s9b(&$@-5s?zh~%)c@>8&n-C%tI;dy
fIPx%6?jIPulcfZaO?tG>2Gib>Q>{K;j6LH6AB!Rv

diff --git a/src/main/utils/CaptionEngine.ts b/src/main/utils/CaptionEngine.ts
index a7c977a..4581e4c 100644
--- a/src/main/utils/CaptionEngine.ts
+++ b/src/main/utils/CaptionEngine.ts
@@ -37,13 +37,13 @@ export class CaptionEngine {
       if (is.dev) {
         this.appPath = path.join(
           app.getAppPath(),
-          'python-subprocess', 'dist', gummyName
+          'caption-engine', 'dist', gummyName
         )
       }
       else {
         this.appPath = path.join(
           process.resourcesPath,
-          'python-subprocess', 'dist', gummyName
+          'caption-engine', 'dist', gummyName
         )
       }
       this.command = []