From 50ea9c5e4cc6a075cf7cda35220e1c515d6b6ff8 Mon Sep 17 00:00:00 2001 From: himeditator Date: Sat, 5 Jul 2025 12:45:43 +0800 Subject: [PATCH] =?UTF-8?q?refactor(caption):=20=E9=87=8D=E6=9E=84?= =?UTF-8?q?=E5=AD=97=E5=B9=95=E5=BC=95=E6=93=8E=E7=BB=93=E6=9E=84=E3=80=81?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=AD=97=E5=B9=95=E5=BC=95=E6=93=8E=E7=A9=BA?= =?UTF-8?q?=E7=BD=AE=E6=8A=A5=E9=94=99=20(#2)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 修复gummy字幕引擎长时间空置报错的问题 - 将 python-subprocess 文件夹重命名为 caption-engine - 删除未使用的 prototype 代码 --- .gitignore | 2 +- README.md | 4 +- README_en.md | 4 +- assets/engine-manual_zh.md | 4 +- .../audio2text/gummy.py | 10 +- .../main-gummy.py | 6 +- .../main-gummy.spec | 0 caption-engine/requirements.txt | 5 + .../sysaudio/linux.py | 0 .../sysaudio/win.py | 8 +- electron-builder.yml | 4 +- python-prototype/gummy.ipynb | 221 ------------------ python-prototype/requirements.txt | 4 - python-subprocess/requirements.txt | Bin 214 -> 0 bytes src/main/utils/CaptionEngine.ts | 4 +- 15 files changed, 31 insertions(+), 245 deletions(-) rename {python-subprocess => caption-engine}/audio2text/gummy.py (96%) rename {python-subprocess => caption-engine}/main-gummy.py (89%) rename {python-subprocess => caption-engine}/main-gummy.spec (100%) create mode 100644 caption-engine/requirements.txt rename {python-subprocess => caption-engine}/sysaudio/linux.py (100%) rename {python-subprocess => caption-engine}/sysaudio/win.py (98%) delete mode 100644 python-prototype/gummy.ipynb delete mode 100644 python-prototype/requirements.txt delete mode 100644 python-subprocess/requirements.txt diff --git a/.gitignore b/.gitignore index f385eb0..60312fd 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,4 @@ out *.log* __pycache__ subenv -python-subprocess/build \ No newline at end of file +caption-engine/build diff --git a/README.md b/README.md index 1994c57..d3b4b58 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ npm install > > 本项目的 gummy 字幕引擎是一个 python 子程序,通过 pyinstaller 打包为可执行文件。 运行字幕引擎子程序的代码在 `src\main\utils\engine.ts` 文件中。 -首先进入 `python-subprocess` 文件夹,执行如下指令创建虚拟环境: +首先进入 `caption-engine` 文件夹,执行如下指令创建虚拟环境: ```bash python -m venv subenv @@ -86,7 +86,7 @@ pip install -r requirements.txt pyinstaller --onefile main-gummy.py ``` -此时项目构建完成,在进入 `python-subprocess/dist` 文件夹可见对应的可执行文件。即可进行后续操作。 +此时项目构建完成,在进入 `caption-engine/dist` 文件夹可见对应的可执行文件。即可进行后续操作。 ### 运行项目 diff --git a/README_en.md b/README_en.md index 3edc398..eeaadec 100644 --- a/README_en.md +++ b/README_en.md @@ -65,7 +65,7 @@ npm install > > The gummy subtitle engine in this project is a Python subprocess, packaged into an executable file using pyinstaller. The code for running the subtitle engine subprocess is in the `src\main\utils\engine.ts` file. -First, enter the `python-subprocess` folder and execute the following command to create a virtual environment: +First, enter the `caption-engine` folder and execute the following command to create a virtual environment: ```bash python -m venv subenv @@ -92,7 +92,7 @@ Then build the project using `pyinstaller`: pyinstaller --onefile main-gummy.py ``` -At this point, the project is built. You can find the corresponding executable file in the `python-subprocess/dist` folder. You can proceed with further operations. +At this point, the project is built. You can find the corresponding executable file in the `caption-engine/dist` folder. You can proceed with further operations. ### Run the Project diff --git a/assets/engine-manual_zh.md b/assets/engine-manual_zh.md index b19aeb8..6df6a22 100644 --- a/assets/engine-manual_zh.md +++ b/assets/engine-manual_zh.md @@ -39,7 +39,7 @@ export interface CaptionItem { 如果使用 python 语言,可以参考以下方式将数据传递给主程序: ```python -# python-subprocess\audio2text\gummy.py +# caption-engine\audio2text\gummy.py ... def send_to_node(self, data): """ @@ -84,4 +84,4 @@ export interface CaptionItem { ## 参考代码 -本项目 `python-subprocess` 文件夹下的 `main-gummy.py` 文件为默认字幕引擎的入口代码。`src\main\utils\engine.ts` 为服务端获取字幕引擎数据和进行处理的代码。可以根据需要阅读了解字幕引擎的实现细节和完整运行过程。 \ No newline at end of file +本项目 `caption-engine` 文件夹下的 `main-gummy.py` 文件为默认字幕引擎的入口代码。`src\main\utils\engine.ts` 为服务端获取字幕引擎数据和进行处理的代码。可以根据需要阅读了解字幕引擎的实现细节和完整运行过程。 diff --git a/python-subprocess/audio2text/gummy.py b/caption-engine/audio2text/gummy.py similarity index 96% rename from python-subprocess/audio2text/gummy.py rename to caption-engine/audio2text/gummy.py index 9f420f8..1d7de96 100644 --- a/python-subprocess/audio2text/gummy.py +++ b/caption-engine/audio2text/gummy.py @@ -2,7 +2,7 @@ from dashscope.audio.asr import ( TranslationRecognizerCallback, TranscriptionResult, TranslationResult, - TranslationRecognizerRealtime + TranslationRecognizerRealtime ) from datetime import datetime import json @@ -17,11 +17,13 @@ class Callback(TranslationRecognizerCallback): self.usage = 0 self.cur_id = -1 self.time_str = '' - + def on_open(self) -> None: + # print("on_open") pass def on_close(self) -> None: + # print("on_close") pass def on_event( @@ -44,11 +46,11 @@ class Callback(TranslationRecognizerCallback): caption['time_s'] = self.time_str caption['time_t'] = datetime.now().strftime('%H:%M:%S') caption['translation'] = "" - + if translation_result is not None: lang = translation_result.get_language_list()[0] caption['translation'] = translation_result.get_translation(lang).text - + if usage: self.usage += usage['duration'] diff --git a/python-subprocess/main-gummy.py b/caption-engine/main-gummy.py similarity index 89% rename from python-subprocess/main-gummy.py rename to caption-engine/main-gummy.py index 5b3903e..df648e3 100644 --- a/python-subprocess/main-gummy.py +++ b/caption-engine/main-gummy.py @@ -27,7 +27,11 @@ def convert_audio_to_text(s_lang, t_lang, audio_type): if not stream.stream: continue data = stream.stream.read(stream.CHUNK) data = mergeStreamChannels(data, stream.CHANNELS) - gummy.translator.send_audio_frame(data) + try: + gummy.translator.send_audio_frame(data) + except: + gummy.translator.start() + gummy.translator.send_audio_frame(data) except KeyboardInterrupt: stream.closeStream() gummy.translator.stop() diff --git a/python-subprocess/main-gummy.spec b/caption-engine/main-gummy.spec similarity index 100% rename from python-subprocess/main-gummy.spec rename to caption-engine/main-gummy.spec diff --git a/caption-engine/requirements.txt b/caption-engine/requirements.txt new file mode 100644 index 0000000..eefe30b --- /dev/null +++ b/caption-engine/requirements.txt @@ -0,0 +1,5 @@ +dashscope==1.23.5 +numpy==2.2.6 +PyAudio==0.2.14 +PyAudioWPatch==0.2.12.7 # Windows only +pyinstaller==6.14.1 diff --git a/python-subprocess/sysaudio/linux.py b/caption-engine/sysaudio/linux.py similarity index 100% rename from python-subprocess/sysaudio/linux.py rename to caption-engine/sysaudio/linux.py diff --git a/python-subprocess/sysaudio/win.py b/caption-engine/sysaudio/win.py similarity index 98% rename from python-subprocess/sysaudio/win.py rename to caption-engine/sysaudio/win.py index f7c7af6..bb7b121 100644 --- a/python-subprocess/sysaudio/win.py +++ b/caption-engine/sysaudio/win.py @@ -35,7 +35,7 @@ def getDefaultLoopbackDevice(mic: pyaudio.PyAudio, info = True)->dict: print("Run `python -m pyaudiowpatch` to check available devices.") print("Exiting...") exit() - + if(info): print(f"Output Stream Device: #{default_speaker['index']} {default_speaker['name']}") return default_speaker @@ -64,7 +64,7 @@ def mergeStreamChannels(data, channels): class AudioStream: """ 获取系统音频流 - + 参数: audio_type: (默认)0-系统音频输出流,1-系统音频输入流 """ @@ -116,7 +116,7 @@ class AudioStream: input_device_index = self.INDEX ) return self.stream - + def closeStream(self): """ 关闭系统音频输出流 @@ -124,4 +124,4 @@ class AudioStream: if self.stream is None: return self.stream.stop_stream() self.stream.close() - self.stream = None \ No newline at end of file + self.stream = None diff --git a/electron-builder.yml b/electron-builder.yml index ef543cc..fc3a4cc 100644 --- a/electron-builder.yml +++ b/electron-builder.yml @@ -10,8 +10,8 @@ files: - '!{.env,.env.*,.npmrc,pnpm-lock.yaml}' - '!{tsconfig.json,tsconfig.node.json,tsconfig.web.json}' extraResources: - from: ./python-subprocess/dist/main-gummy.exe - to: ./python-subprocess/dist/main-gummy.exe + from: ./caption-engine/dist/main-gummy.exe + to: ./caption-engine/dist/main-gummy.exe asarUnpack: - resources/** win: diff --git a/python-prototype/gummy.ipynb b/python-prototype/gummy.ipynb deleted file mode 100644 index a105f29..0000000 --- a/python-prototype/gummy.ipynb +++ /dev/null @@ -1,221 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from dashscope.audio.asr import *\n", - "import pyaudiowpatch as pyaudio\n", - "import numpy as np\n", - "\n", - "\n", - "def getDefaultSpeakers(mic: pyaudio.PyAudio, info = True):\n", - " \"\"\"\n", - " 获取默认的系统音频输出的回环设备\n", - " Args:\n", - " mic (pyaudio.PyAudio): pyaudio对象\n", - " info (bool, optional): 是否打印设备信息. Defaults to True.\n", - "\n", - " Returns:\n", - " dict: 统音频输出的回环设备\n", - " \"\"\"\n", - " try:\n", - " WASAPI_info = mic.get_host_api_info_by_type(pyaudio.paWASAPI)\n", - " except OSError:\n", - " print(\"Looks like WASAPI is not available on the system. Exiting...\")\n", - " exit()\n", - "\n", - " default_speaker = mic.get_device_info_by_index(WASAPI_info[\"defaultOutputDevice\"])\n", - " if(info): print(\"wasapi_info:\\n\", WASAPI_info, \"\\n\")\n", - " if(info): print(\"default_speaker:\\n\", default_speaker, \"\\n\")\n", - "\n", - " if not default_speaker[\"isLoopbackDevice\"]:\n", - " for loopback in mic.get_loopback_device_info_generator():\n", - " if default_speaker[\"name\"] in loopback[\"name\"]:\n", - " default_speaker = loopback\n", - " if(info): print(\"Using loopback device:\\n\", default_speaker, \"\\n\")\n", - " break\n", - " else:\n", - " print(\"Default loopback output device not found.\")\n", - " print(\"Run `python -m pyaudiowpatch` to check available devices.\")\n", - " print(\"Exiting...\")\n", - " exit()\n", - " \n", - " if(info): print(f\"Recording Device: #{default_speaker['index']} {default_speaker['name']}\")\n", - " return default_speaker\n", - "\n", - "\n", - "class Callback(TranslationRecognizerCallback):\n", - " \"\"\"\n", - " 语音大模型流式传输回调对象\n", - " \"\"\"\n", - " def __init__(self):\n", - " super().__init__()\n", - " self.usage = 0\n", - " self.sentences = []\n", - " self.translations = []\n", - " \n", - " def on_open(self) -> None:\n", - " print(\"\\n流式翻译开始...\\n\")\n", - "\n", - " def on_close(self) -> None:\n", - " print(f\"\\nTokens消耗:{self.usage}\")\n", - " print(f\"流式翻译结束...\\n\")\n", - " for i in range(len(self.sentences)):\n", - " print(f\"\\n{self.sentences[i]}\\n{self.translations[i]}\\n\")\n", - "\n", - " def on_event(\n", - " self,\n", - " request_id,\n", - " transcription_result: TranscriptionResult,\n", - " translation_result: TranslationResult,\n", - " usage\n", - " ) -> None:\n", - " if transcription_result is not None:\n", - " id = transcription_result.sentence_id\n", - " text = transcription_result.text\n", - " if transcription_result.stash is not None:\n", - " stash = transcription_result.stash.text\n", - " else:\n", - " stash = \"\"\n", - " print(f\"#{id}: {text}{stash}\")\n", - " if usage: self.sentences.append(text)\n", - " \n", - " if translation_result is not None:\n", - " lang = translation_result.get_language_list()[0]\n", - " text = translation_result.get_translation(lang).text\n", - " if translation_result.get_translation(lang).stash is not None:\n", - " stash = translation_result.get_translation(lang).stash.text\n", - " else:\n", - " stash = \"\"\n", - " print(f\"#{lang}: {text}{stash}\")\n", - " if usage: self.translations.append(text)\n", - " \n", - " if usage: self.usage += usage['duration']" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "采样输入设备:\n", - " - 序号:37\n", - " - 名称:耳机 (HUAWEI FreeLace 活力版) [Loopback]\n", - " - 最大输入通道数:2\n", - " - 默认低输入延迟:0.003s\n", - " - 默认高输入延迟:0.01s\n", - " - 默认采样率:44100.0Hz\n", - " - 是否回环设备:True\n", - "\n", - "音频样本块大小:4410\n", - "样本位宽:2\n", - "音频数据格式:8\n", - "音频通道数:2\n", - "音频采样率:44100\n", - "\n" - ] - } - ], - "source": [ - "mic = pyaudio.PyAudio()\n", - "default_speaker = getDefaultSpeakers(mic, False)\n", - "\n", - "SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)\n", - "FORMAT = pyaudio.paInt16\n", - "CHANNELS = default_speaker[\"maxInputChannels\"]\n", - "RATE = int(default_speaker[\"defaultSampleRate\"])\n", - "CHUNK = RATE // 10\n", - "INDEX = default_speaker[\"index\"]\n", - "\n", - "dev_info = f\"\"\"\n", - "采样输入设备:\n", - " - 序号:{default_speaker['index']}\n", - " - 名称:{default_speaker['name']}\n", - " - 最大输入通道数:{default_speaker['maxInputChannels']}\n", - " - 默认低输入延迟:{default_speaker['defaultLowInputLatency']}s\n", - " - 默认高输入延迟:{default_speaker['defaultHighInputLatency']}s\n", - " - 默认采样率:{default_speaker['defaultSampleRate']}Hz\n", - " - 是否回环设备:{default_speaker['isLoopbackDevice']}\n", - "\n", - "音频样本块大小:{CHUNK}\n", - "样本位宽:{SAMP_WIDTH}\n", - "音频数据格式:{FORMAT}\n", - "音频通道数:{CHANNELS}\n", - "音频采样率:{RATE}\n", - "\"\"\"\n", - "print(dev_info)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "RECORD_SECONDS = 20 # 监听时长(s)\n", - "\n", - "stream = mic.open(\n", - " format = FORMAT,\n", - " channels = CHANNELS,\n", - " rate = RATE,\n", - " input = True,\n", - " input_device_index = INDEX\n", - ")\n", - "translator = TranslationRecognizerRealtime(\n", - " model = \"gummy-realtime-v1\",\n", - " format = \"pcm\",\n", - " sample_rate = RATE,\n", - " transcription_enabled = True,\n", - " translation_enabled = True,\n", - " source_language = \"ja\",\n", - " translation_target_languages = [\"zh\"],\n", - " callback = Callback()\n", - ")\n", - "translator.start()\n", - "\n", - "for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):\n", - " data = stream.read(CHUNK)\n", - " data_np = np.frombuffer(data, dtype=np.int16)\n", - " data_np_r = data_np.reshape(-1, CHANNELS)\n", - " print(data_np_r.shape)\n", - " mono_data = np.mean(data_np_r.astype(np.float32), axis=1)\n", - " mono_data = mono_data.astype(np.int16)\n", - " mono_data_bytes = mono_data.tobytes()\n", - " translator.send_audio_frame(mono_data_bytes)\n", - "\n", - "translator.stop()\n", - "stream.stop_stream()\n", - "stream.close()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "mystd", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python-prototype/requirements.txt b/python-prototype/requirements.txt deleted file mode 100644 index 12dede0..0000000 --- a/python-prototype/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -numpy -dashscope -pyaudio -pyaudiowpatch \ No newline at end of file diff --git a/python-subprocess/requirements.txt b/python-subprocess/requirements.txt deleted file mode 100644 index d888ae587250a5bc2a677efdfe931c236466b60e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 214 zcmZXOF$%&!5Jg{|Qv~dTi5e>lZ(x@?7$HL3h%tzlSKlsH$}+PsZ~mYEes#1=^wyD) zD<^iol7fnreM3fI<|2D0RwY(YOAgNV(vGw0s9b(&$@-5s?zh~%)c@>8&n-C%tI;dy fIPx%6?jIPulcfZaO?tG>2Gib>Q>{K;j6LH6AB!Rv diff --git a/src/main/utils/CaptionEngine.ts b/src/main/utils/CaptionEngine.ts index a7c977a..4581e4c 100644 --- a/src/main/utils/CaptionEngine.ts +++ b/src/main/utils/CaptionEngine.ts @@ -37,13 +37,13 @@ export class CaptionEngine { if (is.dev) { this.appPath = path.join( app.getAppPath(), - 'python-subprocess', 'dist', gummyName + 'caption-engine', 'dist', gummyName ) } else { this.appPath = path.join( process.resourcesPath, - 'python-subprocess', 'dist', gummyName + 'caption-engine', 'dist', gummyName ) } this.command = []