mirror of
https://github.com/HiMeditator/auto-caption.git
synced 2026-02-04 04:14:42 +08:00
refactor(caption): 重构字幕引擎结构、修复字幕引擎空置报错 (#2)
- 修复gummy字幕引擎长时间空置报错的问题 - 将 python-subprocess 文件夹重命名为 caption-engine - 删除未使用的 prototype 代码
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -6,4 +6,4 @@ out
|
|||||||
*.log*
|
*.log*
|
||||||
__pycache__
|
__pycache__
|
||||||
subenv
|
subenv
|
||||||
python-subprocess/build
|
caption-engine/build
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ npm install
|
|||||||
>
|
>
|
||||||
> 本项目的 gummy 字幕引擎是一个 python 子程序,通过 pyinstaller 打包为可执行文件。 运行字幕引擎子程序的代码在 `src\main\utils\engine.ts` 文件中。
|
> 本项目的 gummy 字幕引擎是一个 python 子程序,通过 pyinstaller 打包为可执行文件。 运行字幕引擎子程序的代码在 `src\main\utils\engine.ts` 文件中。
|
||||||
|
|
||||||
首先进入 `python-subprocess` 文件夹,执行如下指令创建虚拟环境:
|
首先进入 `caption-engine` 文件夹,执行如下指令创建虚拟环境:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m venv subenv
|
python -m venv subenv
|
||||||
@@ -86,7 +86,7 @@ pip install -r requirements.txt
|
|||||||
pyinstaller --onefile main-gummy.py
|
pyinstaller --onefile main-gummy.py
|
||||||
```
|
```
|
||||||
|
|
||||||
此时项目构建完成,在进入 `python-subprocess/dist` 文件夹可见对应的可执行文件。即可进行后续操作。
|
此时项目构建完成,在进入 `caption-engine/dist` 文件夹可见对应的可执行文件。即可进行后续操作。
|
||||||
|
|
||||||
### 运行项目
|
### 运行项目
|
||||||
|
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ npm install
|
|||||||
>
|
>
|
||||||
> The gummy subtitle engine in this project is a Python subprocess, packaged into an executable file using pyinstaller. The code for running the subtitle engine subprocess is in the `src\main\utils\engine.ts` file.
|
> The gummy subtitle engine in this project is a Python subprocess, packaged into an executable file using pyinstaller. The code for running the subtitle engine subprocess is in the `src\main\utils\engine.ts` file.
|
||||||
|
|
||||||
First, enter the `python-subprocess` folder and execute the following command to create a virtual environment:
|
First, enter the `caption-engine` folder and execute the following command to create a virtual environment:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python -m venv subenv
|
python -m venv subenv
|
||||||
@@ -92,7 +92,7 @@ Then build the project using `pyinstaller`:
|
|||||||
pyinstaller --onefile main-gummy.py
|
pyinstaller --onefile main-gummy.py
|
||||||
```
|
```
|
||||||
|
|
||||||
At this point, the project is built. You can find the corresponding executable file in the `python-subprocess/dist` folder. You can proceed with further operations.
|
At this point, the project is built. You can find the corresponding executable file in the `caption-engine/dist` folder. You can proceed with further operations.
|
||||||
|
|
||||||
### Run the Project
|
### Run the Project
|
||||||
|
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ export interface CaptionItem {
|
|||||||
如果使用 python 语言,可以参考以下方式将数据传递给主程序:
|
如果使用 python 语言,可以参考以下方式将数据传递给主程序:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# python-subprocess\audio2text\gummy.py
|
# caption-engine\audio2text\gummy.py
|
||||||
...
|
...
|
||||||
def send_to_node(self, data):
|
def send_to_node(self, data):
|
||||||
"""
|
"""
|
||||||
@@ -84,4 +84,4 @@ export interface CaptionItem {
|
|||||||
|
|
||||||
## 参考代码
|
## 参考代码
|
||||||
|
|
||||||
本项目 `python-subprocess` 文件夹下的 `main-gummy.py` 文件为默认字幕引擎的入口代码。`src\main\utils\engine.ts` 为服务端获取字幕引擎数据和进行处理的代码。可以根据需要阅读了解字幕引擎的实现细节和完整运行过程。
|
本项目 `caption-engine` 文件夹下的 `main-gummy.py` 文件为默认字幕引擎的入口代码。`src\main\utils\engine.ts` 为服务端获取字幕引擎数据和进行处理的代码。可以根据需要阅读了解字幕引擎的实现细节和完整运行过程。
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ from dashscope.audio.asr import (
|
|||||||
TranslationRecognizerCallback,
|
TranslationRecognizerCallback,
|
||||||
TranscriptionResult,
|
TranscriptionResult,
|
||||||
TranslationResult,
|
TranslationResult,
|
||||||
TranslationRecognizerRealtime
|
TranslationRecognizerRealtime
|
||||||
)
|
)
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import json
|
import json
|
||||||
@@ -17,11 +17,13 @@ class Callback(TranslationRecognizerCallback):
|
|||||||
self.usage = 0
|
self.usage = 0
|
||||||
self.cur_id = -1
|
self.cur_id = -1
|
||||||
self.time_str = ''
|
self.time_str = ''
|
||||||
|
|
||||||
def on_open(self) -> None:
|
def on_open(self) -> None:
|
||||||
|
# print("on_open")
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def on_close(self) -> None:
|
def on_close(self) -> None:
|
||||||
|
# print("on_close")
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def on_event(
|
def on_event(
|
||||||
@@ -44,11 +46,11 @@ class Callback(TranslationRecognizerCallback):
|
|||||||
caption['time_s'] = self.time_str
|
caption['time_s'] = self.time_str
|
||||||
caption['time_t'] = datetime.now().strftime('%H:%M:%S')
|
caption['time_t'] = datetime.now().strftime('%H:%M:%S')
|
||||||
caption['translation'] = ""
|
caption['translation'] = ""
|
||||||
|
|
||||||
if translation_result is not None:
|
if translation_result is not None:
|
||||||
lang = translation_result.get_language_list()[0]
|
lang = translation_result.get_language_list()[0]
|
||||||
caption['translation'] = translation_result.get_translation(lang).text
|
caption['translation'] = translation_result.get_translation(lang).text
|
||||||
|
|
||||||
if usage:
|
if usage:
|
||||||
self.usage += usage['duration']
|
self.usage += usage['duration']
|
||||||
|
|
||||||
@@ -27,7 +27,11 @@ def convert_audio_to_text(s_lang, t_lang, audio_type):
|
|||||||
if not stream.stream: continue
|
if not stream.stream: continue
|
||||||
data = stream.stream.read(stream.CHUNK)
|
data = stream.stream.read(stream.CHUNK)
|
||||||
data = mergeStreamChannels(data, stream.CHANNELS)
|
data = mergeStreamChannels(data, stream.CHANNELS)
|
||||||
gummy.translator.send_audio_frame(data)
|
try:
|
||||||
|
gummy.translator.send_audio_frame(data)
|
||||||
|
except:
|
||||||
|
gummy.translator.start()
|
||||||
|
gummy.translator.send_audio_frame(data)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
stream.closeStream()
|
stream.closeStream()
|
||||||
gummy.translator.stop()
|
gummy.translator.stop()
|
||||||
5
caption-engine/requirements.txt
Normal file
5
caption-engine/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
dashscope==1.23.5
|
||||||
|
numpy==2.2.6
|
||||||
|
PyAudio==0.2.14
|
||||||
|
PyAudioWPatch==0.2.12.7 # Windows only
|
||||||
|
pyinstaller==6.14.1
|
||||||
@@ -35,7 +35,7 @@ def getDefaultLoopbackDevice(mic: pyaudio.PyAudio, info = True)->dict:
|
|||||||
print("Run `python -m pyaudiowpatch` to check available devices.")
|
print("Run `python -m pyaudiowpatch` to check available devices.")
|
||||||
print("Exiting...")
|
print("Exiting...")
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
if(info): print(f"Output Stream Device: #{default_speaker['index']} {default_speaker['name']}")
|
if(info): print(f"Output Stream Device: #{default_speaker['index']} {default_speaker['name']}")
|
||||||
return default_speaker
|
return default_speaker
|
||||||
|
|
||||||
@@ -64,7 +64,7 @@ def mergeStreamChannels(data, channels):
|
|||||||
class AudioStream:
|
class AudioStream:
|
||||||
"""
|
"""
|
||||||
获取系统音频流
|
获取系统音频流
|
||||||
|
|
||||||
参数:
|
参数:
|
||||||
audio_type: (默认)0-系统音频输出流,1-系统音频输入流
|
audio_type: (默认)0-系统音频输出流,1-系统音频输入流
|
||||||
"""
|
"""
|
||||||
@@ -116,7 +116,7 @@ class AudioStream:
|
|||||||
input_device_index = self.INDEX
|
input_device_index = self.INDEX
|
||||||
)
|
)
|
||||||
return self.stream
|
return self.stream
|
||||||
|
|
||||||
def closeStream(self):
|
def closeStream(self):
|
||||||
"""
|
"""
|
||||||
关闭系统音频输出流
|
关闭系统音频输出流
|
||||||
@@ -124,4 +124,4 @@ class AudioStream:
|
|||||||
if self.stream is None: return
|
if self.stream is None: return
|
||||||
self.stream.stop_stream()
|
self.stream.stop_stream()
|
||||||
self.stream.close()
|
self.stream.close()
|
||||||
self.stream = None
|
self.stream = None
|
||||||
@@ -10,8 +10,8 @@ files:
|
|||||||
- '!{.env,.env.*,.npmrc,pnpm-lock.yaml}'
|
- '!{.env,.env.*,.npmrc,pnpm-lock.yaml}'
|
||||||
- '!{tsconfig.json,tsconfig.node.json,tsconfig.web.json}'
|
- '!{tsconfig.json,tsconfig.node.json,tsconfig.web.json}'
|
||||||
extraResources:
|
extraResources:
|
||||||
from: ./python-subprocess/dist/main-gummy.exe
|
from: ./caption-engine/dist/main-gummy.exe
|
||||||
to: ./python-subprocess/dist/main-gummy.exe
|
to: ./caption-engine/dist/main-gummy.exe
|
||||||
asarUnpack:
|
asarUnpack:
|
||||||
- resources/**
|
- resources/**
|
||||||
win:
|
win:
|
||||||
|
|||||||
@@ -1,221 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from dashscope.audio.asr import *\n",
|
|
||||||
"import pyaudiowpatch as pyaudio\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"def getDefaultSpeakers(mic: pyaudio.PyAudio, info = True):\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" 获取默认的系统音频输出的回环设备\n",
|
|
||||||
" Args:\n",
|
|
||||||
" mic (pyaudio.PyAudio): pyaudio对象\n",
|
|
||||||
" info (bool, optional): 是否打印设备信息. Defaults to True.\n",
|
|
||||||
"\n",
|
|
||||||
" Returns:\n",
|
|
||||||
" dict: 统音频输出的回环设备\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" try:\n",
|
|
||||||
" WASAPI_info = mic.get_host_api_info_by_type(pyaudio.paWASAPI)\n",
|
|
||||||
" except OSError:\n",
|
|
||||||
" print(\"Looks like WASAPI is not available on the system. Exiting...\")\n",
|
|
||||||
" exit()\n",
|
|
||||||
"\n",
|
|
||||||
" default_speaker = mic.get_device_info_by_index(WASAPI_info[\"defaultOutputDevice\"])\n",
|
|
||||||
" if(info): print(\"wasapi_info:\\n\", WASAPI_info, \"\\n\")\n",
|
|
||||||
" if(info): print(\"default_speaker:\\n\", default_speaker, \"\\n\")\n",
|
|
||||||
"\n",
|
|
||||||
" if not default_speaker[\"isLoopbackDevice\"]:\n",
|
|
||||||
" for loopback in mic.get_loopback_device_info_generator():\n",
|
|
||||||
" if default_speaker[\"name\"] in loopback[\"name\"]:\n",
|
|
||||||
" default_speaker = loopback\n",
|
|
||||||
" if(info): print(\"Using loopback device:\\n\", default_speaker, \"\\n\")\n",
|
|
||||||
" break\n",
|
|
||||||
" else:\n",
|
|
||||||
" print(\"Default loopback output device not found.\")\n",
|
|
||||||
" print(\"Run `python -m pyaudiowpatch` to check available devices.\")\n",
|
|
||||||
" print(\"Exiting...\")\n",
|
|
||||||
" exit()\n",
|
|
||||||
" \n",
|
|
||||||
" if(info): print(f\"Recording Device: #{default_speaker['index']} {default_speaker['name']}\")\n",
|
|
||||||
" return default_speaker\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"class Callback(TranslationRecognizerCallback):\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" 语音大模型流式传输回调对象\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" def __init__(self):\n",
|
|
||||||
" super().__init__()\n",
|
|
||||||
" self.usage = 0\n",
|
|
||||||
" self.sentences = []\n",
|
|
||||||
" self.translations = []\n",
|
|
||||||
" \n",
|
|
||||||
" def on_open(self) -> None:\n",
|
|
||||||
" print(\"\\n流式翻译开始...\\n\")\n",
|
|
||||||
"\n",
|
|
||||||
" def on_close(self) -> None:\n",
|
|
||||||
" print(f\"\\nTokens消耗:{self.usage}\")\n",
|
|
||||||
" print(f\"流式翻译结束...\\n\")\n",
|
|
||||||
" for i in range(len(self.sentences)):\n",
|
|
||||||
" print(f\"\\n{self.sentences[i]}\\n{self.translations[i]}\\n\")\n",
|
|
||||||
"\n",
|
|
||||||
" def on_event(\n",
|
|
||||||
" self,\n",
|
|
||||||
" request_id,\n",
|
|
||||||
" transcription_result: TranscriptionResult,\n",
|
|
||||||
" translation_result: TranslationResult,\n",
|
|
||||||
" usage\n",
|
|
||||||
" ) -> None:\n",
|
|
||||||
" if transcription_result is not None:\n",
|
|
||||||
" id = transcription_result.sentence_id\n",
|
|
||||||
" text = transcription_result.text\n",
|
|
||||||
" if transcription_result.stash is not None:\n",
|
|
||||||
" stash = transcription_result.stash.text\n",
|
|
||||||
" else:\n",
|
|
||||||
" stash = \"\"\n",
|
|
||||||
" print(f\"#{id}: {text}{stash}\")\n",
|
|
||||||
" if usage: self.sentences.append(text)\n",
|
|
||||||
" \n",
|
|
||||||
" if translation_result is not None:\n",
|
|
||||||
" lang = translation_result.get_language_list()[0]\n",
|
|
||||||
" text = translation_result.get_translation(lang).text\n",
|
|
||||||
" if translation_result.get_translation(lang).stash is not None:\n",
|
|
||||||
" stash = translation_result.get_translation(lang).stash.text\n",
|
|
||||||
" else:\n",
|
|
||||||
" stash = \"\"\n",
|
|
||||||
" print(f\"#{lang}: {text}{stash}\")\n",
|
|
||||||
" if usage: self.translations.append(text)\n",
|
|
||||||
" \n",
|
|
||||||
" if usage: self.usage += usage['duration']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"\n",
|
|
||||||
"采样输入设备:\n",
|
|
||||||
" - 序号:37\n",
|
|
||||||
" - 名称:耳机 (HUAWEI FreeLace 活力版) [Loopback]\n",
|
|
||||||
" - 最大输入通道数:2\n",
|
|
||||||
" - 默认低输入延迟:0.003s\n",
|
|
||||||
" - 默认高输入延迟:0.01s\n",
|
|
||||||
" - 默认采样率:44100.0Hz\n",
|
|
||||||
" - 是否回环设备:True\n",
|
|
||||||
"\n",
|
|
||||||
"音频样本块大小:4410\n",
|
|
||||||
"样本位宽:2\n",
|
|
||||||
"音频数据格式:8\n",
|
|
||||||
"音频通道数:2\n",
|
|
||||||
"音频采样率:44100\n",
|
|
||||||
"\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"mic = pyaudio.PyAudio()\n",
|
|
||||||
"default_speaker = getDefaultSpeakers(mic, False)\n",
|
|
||||||
"\n",
|
|
||||||
"SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)\n",
|
|
||||||
"FORMAT = pyaudio.paInt16\n",
|
|
||||||
"CHANNELS = default_speaker[\"maxInputChannels\"]\n",
|
|
||||||
"RATE = int(default_speaker[\"defaultSampleRate\"])\n",
|
|
||||||
"CHUNK = RATE // 10\n",
|
|
||||||
"INDEX = default_speaker[\"index\"]\n",
|
|
||||||
"\n",
|
|
||||||
"dev_info = f\"\"\"\n",
|
|
||||||
"采样输入设备:\n",
|
|
||||||
" - 序号:{default_speaker['index']}\n",
|
|
||||||
" - 名称:{default_speaker['name']}\n",
|
|
||||||
" - 最大输入通道数:{default_speaker['maxInputChannels']}\n",
|
|
||||||
" - 默认低输入延迟:{default_speaker['defaultLowInputLatency']}s\n",
|
|
||||||
" - 默认高输入延迟:{default_speaker['defaultHighInputLatency']}s\n",
|
|
||||||
" - 默认采样率:{default_speaker['defaultSampleRate']}Hz\n",
|
|
||||||
" - 是否回环设备:{default_speaker['isLoopbackDevice']}\n",
|
|
||||||
"\n",
|
|
||||||
"音频样本块大小:{CHUNK}\n",
|
|
||||||
"样本位宽:{SAMP_WIDTH}\n",
|
|
||||||
"音频数据格式:{FORMAT}\n",
|
|
||||||
"音频通道数:{CHANNELS}\n",
|
|
||||||
"音频采样率:{RATE}\n",
|
|
||||||
"\"\"\"\n",
|
|
||||||
"print(dev_info)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"RECORD_SECONDS = 20 # 监听时长(s)\n",
|
|
||||||
"\n",
|
|
||||||
"stream = mic.open(\n",
|
|
||||||
" format = FORMAT,\n",
|
|
||||||
" channels = CHANNELS,\n",
|
|
||||||
" rate = RATE,\n",
|
|
||||||
" input = True,\n",
|
|
||||||
" input_device_index = INDEX\n",
|
|
||||||
")\n",
|
|
||||||
"translator = TranslationRecognizerRealtime(\n",
|
|
||||||
" model = \"gummy-realtime-v1\",\n",
|
|
||||||
" format = \"pcm\",\n",
|
|
||||||
" sample_rate = RATE,\n",
|
|
||||||
" transcription_enabled = True,\n",
|
|
||||||
" translation_enabled = True,\n",
|
|
||||||
" source_language = \"ja\",\n",
|
|
||||||
" translation_target_languages = [\"zh\"],\n",
|
|
||||||
" callback = Callback()\n",
|
|
||||||
")\n",
|
|
||||||
"translator.start()\n",
|
|
||||||
"\n",
|
|
||||||
"for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):\n",
|
|
||||||
" data = stream.read(CHUNK)\n",
|
|
||||||
" data_np = np.frombuffer(data, dtype=np.int16)\n",
|
|
||||||
" data_np_r = data_np.reshape(-1, CHANNELS)\n",
|
|
||||||
" print(data_np_r.shape)\n",
|
|
||||||
" mono_data = np.mean(data_np_r.astype(np.float32), axis=1)\n",
|
|
||||||
" mono_data = mono_data.astype(np.int16)\n",
|
|
||||||
" mono_data_bytes = mono_data.tobytes()\n",
|
|
||||||
" translator.send_audio_frame(mono_data_bytes)\n",
|
|
||||||
"\n",
|
|
||||||
"translator.stop()\n",
|
|
||||||
"stream.stop_stream()\n",
|
|
||||||
"stream.close()"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "mystd",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.12"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
||||||
@@ -1,4 +0,0 @@
|
|||||||
numpy
|
|
||||||
dashscope
|
|
||||||
pyaudio
|
|
||||||
pyaudiowpatch
|
|
||||||
Binary file not shown.
@@ -37,13 +37,13 @@ export class CaptionEngine {
|
|||||||
if (is.dev) {
|
if (is.dev) {
|
||||||
this.appPath = path.join(
|
this.appPath = path.join(
|
||||||
app.getAppPath(),
|
app.getAppPath(),
|
||||||
'python-subprocess', 'dist', gummyName
|
'caption-engine', 'dist', gummyName
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
this.appPath = path.join(
|
this.appPath = path.join(
|
||||||
process.resourcesPath,
|
process.resourcesPath,
|
||||||
'python-subprocess', 'dist', gummyName
|
'caption-engine', 'dist', gummyName
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
this.command = []
|
this.command = []
|
||||||
|
|||||||
Reference in New Issue
Block a user