diff --git a/.editorconfig b/.editorconfig index 7d97d2b..92d41ba 100644 --- a/.editorconfig +++ b/.editorconfig @@ -10,3 +10,6 @@ trim_trailing_whitespace = true [*.py] indent_size = 4 + +[*.ipynb] +indent_size = 4 diff --git a/.vscode/settings.json b/.vscode/settings.json index 20951e4..cf68942 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -7,5 +7,8 @@ }, "[json]": { "editor.defaultFormatter": "esbenp.prettier-vscode" - } + }, + "python.analysis.extraPaths": [ + "./caption-engine" + ] } diff --git a/assets/media/structure_ja.png b/assets/media/structure_ja.png index b88e61b..537b251 100644 Binary files a/assets/media/structure_ja.png and b/assets/media/structure_ja.png differ diff --git a/assets/structure.pptx b/assets/structure.pptx index 99345a9..48ed495 100644 Binary files a/assets/structure.pptx and b/assets/structure.pptx differ diff --git a/caption-engine/audio2text/__init__.py b/caption-engine/audio2text/__init__.py new file mode 100644 index 0000000..6192084 --- /dev/null +++ b/caption-engine/audio2text/__init__.py @@ -0,0 +1,2 @@ +from dashscope.common.error import InvalidParameter +from .gummy import GummyTranslator diff --git a/caption-engine/audio2text/gummy.py b/caption-engine/audio2text/gummy.py index 1d7de96..949039b 100644 --- a/caption-engine/audio2text/gummy.py +++ b/caption-engine/audio2text/gummy.py @@ -69,6 +69,14 @@ class Callback(TranslationRecognizerCallback): print(f"Error sending data to Node.js: {e}", file=sys.stderr) class GummyTranslator: + """ + 使用 Gummy 引擎流式处理的音频数据,并在标准输出中输出与 Auto Caption 软件可读取的 JSON 字符串数据 + + 初始化参数: + rate: 音频采样率 + source: 源语言代码字符串(zh, en, ja 等) + target: 目标语言代码字符串(zh, en, ja 等) + """ def __init__(self, rate, source, target): self.translator = TranslationRecognizerRealtime( model = "gummy-realtime-v1", @@ -80,3 +88,15 @@ class GummyTranslator: translation_target_languages = [target], callback = Callback() ) + + def start(self): + """启动 Gummy 引擎""" + self.translator.start() + + def send_audio_frame(self, data): + """发送音频帧""" + self.translator.send_audio_frame(data) + + def stop(self): + """停止 Gummy 引擎""" + self.translator.stop() diff --git a/caption-engine/audioprcs/__init__.py b/caption-engine/audioprcs/__init__.py new file mode 100644 index 0000000..422603b --- /dev/null +++ b/caption-engine/audioprcs/__init__.py @@ -0,0 +1 @@ +from .streamchnl import mergeStreamChannels diff --git a/caption-engine/audioprcs/streamchnl.py b/caption-engine/audioprcs/streamchnl.py new file mode 100644 index 0000000..08f3184 --- /dev/null +++ b/caption-engine/audioprcs/streamchnl.py @@ -0,0 +1,22 @@ +import numpy as np + +def mergeStreamChannels(data, channels): + """ + 将当前多通道流数据合并为单通道流数据 + + Args: + data: 多通道数据 + channels: 通道数 + + Returns: + mono_data_bytes: 单通道数据 + """ + # (length * channels,) + data_np = np.frombuffer(data, dtype=np.int16) + # (length, channels) + data_np_r = data_np.reshape(-1, channels) + # (length,) + mono_data = np.mean(data_np_r.astype(np.float32), axis=1) + mono_data = mono_data.astype(np.int16) + mono_data_bytes = mono_data.tobytes() + return mono_data_bytes diff --git a/caption-engine/main-gummy.py b/caption-engine/main-gummy.py index df648e3..6b3cbd6 100644 --- a/caption-engine/main-gummy.py +++ b/caption-engine/main-gummy.py @@ -1,40 +1,41 @@ import sys +import argparse if sys.platform == 'win32': - from sysaudio.win import AudioStream, mergeStreamChannels + from sysaudio.win import AudioStream elif sys.platform == 'linux': - from sysaudio.linux import AudioStream, mergeStreamChannels + from sysaudio.linux import AudioStream else: raise NotImplementedError(f"Unsupported platform: {sys.platform}") -from audio2text.gummy import GummyTranslator -import sys -import argparse +from audioprcs import mergeStreamChannels +from audio2text import InvalidParameter, GummyTranslator + def convert_audio_to_text(s_lang, t_lang, audio_type): sys.stdout.reconfigure(line_buffering=True) # type: ignore stream = AudioStream(audio_type) - stream.openStream() if t_lang == 'none': gummy = GummyTranslator(stream.RATE, s_lang, None) else: gummy = GummyTranslator(stream.RATE, s_lang, t_lang) - gummy.translator.start() + + stream.openStream() + gummy.start() while True: try: - if not stream.stream: continue - data = stream.stream.read(stream.CHUNK) + data = stream.read_chunk() data = mergeStreamChannels(data, stream.CHANNELS) try: - gummy.translator.send_audio_frame(data) - except: - gummy.translator.start() - gummy.translator.send_audio_frame(data) + gummy.send_audio_frame(data) + except InvalidParameter: + gummy.start() + gummy.send_audio_frame(data) except KeyboardInterrupt: stream.closeStream() - gummy.translator.stop() + gummy.stop() break @@ -47,5 +48,5 @@ if __name__ == "__main__": convert_audio_to_text( args.source_language, args.target_language, - 0 if args.audio_type == '0' else 1 + int(args.audio_type) ) diff --git a/caption-engine/sysaudio/__init__.py b/caption-engine/sysaudio/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/caption-engine/sysaudio/linux.py b/caption-engine/sysaudio/linux.py index 3473515..74ac8a0 100644 --- a/caption-engine/sysaudio/linux.py +++ b/caption-engine/sysaudio/linux.py @@ -1,30 +1,15 @@ import pyaudio -import numpy as np - -def mergeStreamChannels(data, channels): - """ - 将当前多通道流数据合并为单通道流数据 - - Args: - data: 多通道数据 - channels: 通道数 - - Returns: - mono_data_bytes: 单通道数据 - """ - # (length * channels,) - data_np = np.frombuffer(data, dtype=np.int16) - # (length, channels) - data_np_r = data_np.reshape(-1, channels) - # (length,) - mono_data = np.mean(data_np_r.astype(np.float32), axis=1) - mono_data = mono_data.astype(np.int16) - mono_data_bytes = mono_data.tobytes() - return mono_data_bytes class AudioStream: - def __init__(self, audio_type=1): + """ + 获取系统音频流 + + 初始化参数: + audio_type: 0-系统音频输出流(不支持,不会生效),1-系统音频输入流(默认) + chunk_rate: 每秒采集音频块的数量,默认为20 + """ + def __init__(self, audio_type=1, chunk_rate=20): self.audio_type = audio_type self.mic = pyaudio.PyAudio() self.device = self.mic.get_default_input_device_info() @@ -33,7 +18,7 @@ class AudioStream: self.FORMAT = pyaudio.paInt16 self.CHANNELS = self.device["maxInputChannels"] self.RATE = int(self.device["defaultSampleRate"]) - self.CHUNK = self.RATE // 20 + self.CHUNK = self.RATE // chunk_rate self.INDEX = self.device["index"] def printInfo(self): @@ -62,13 +47,20 @@ class AudioStream: if self.stream: return self.stream self.stream = self.mic.open( format = self.FORMAT, - channels = self.CHANNELS, + channels = int(self.CHANNELS), rate = self.RATE, input = True, - input_device_index = self.INDEX + input_device_index = int(self.INDEX) ) return self.stream - + + def read_chunk(self): + """ + 读取音频数据 + """ + if not self.stream: return None + return self.stream.read(self.CHUNK) + def closeStream(self): """ 关闭系统音频输出流 @@ -76,4 +68,4 @@ class AudioStream: if self.stream is None: return self.stream.stop_stream() self.stream.close() - self.stream = None \ No newline at end of file + self.stream = None diff --git a/caption-engine/sysaudio/win.py b/caption-engine/sysaudio/win.py index bb7b121..e5fd6b5 100644 --- a/caption-engine/sysaudio/win.py +++ b/caption-engine/sysaudio/win.py @@ -1,7 +1,6 @@ """获取 Windows 系统音频输出流""" import pyaudiowpatch as pyaudio -import numpy as np def getDefaultLoopbackDevice(mic: pyaudio.PyAudio, info = True)->dict: @@ -40,35 +39,15 @@ def getDefaultLoopbackDevice(mic: pyaudio.PyAudio, info = True)->dict: return default_speaker -def mergeStreamChannels(data, channels): - """ - 将当前多通道流数据合并为单通道流数据 - - Args: - data: 多通道数据 - channels: 通道数 - - Returns: - mono_data_bytes: 单通道数据 - """ - # (length * channels,) - data_np = np.frombuffer(data, dtype=np.int16) - # (length, channels) - data_np_r = data_np.reshape(-1, channels) - # (length,) - mono_data = np.mean(data_np_r.astype(np.float32), axis=1) - mono_data = mono_data.astype(np.int16) - mono_data_bytes = mono_data.tobytes() - return mono_data_bytes - class AudioStream: """ 获取系统音频流 - 参数: - audio_type: (默认)0-系统音频输出流,1-系统音频输入流 + 初始化参数: + audio_type: 0-系统音频输出流(默认),1-系统音频输入流 + chunk_rate: 每秒采集音频块的数量,默认为20 """ - def __init__(self, audio_type=0): + def __init__(self, audio_type=0, chunk_rate=20): self.audio_type = audio_type self.mic = pyaudio.PyAudio() if self.audio_type == 0: @@ -80,7 +59,7 @@ class AudioStream: self.FORMAT = pyaudio.paInt16 self.CHANNELS = self.device["maxInputChannels"] self.RATE = int(self.device["defaultSampleRate"]) - self.CHUNK = self.RATE // 20 + self.CHUNK = self.RATE // chunk_rate self.INDEX = self.device["index"] def printInfo(self): @@ -117,6 +96,13 @@ class AudioStream: ) return self.stream + def read_chunk(self): + """ + 读取音频数据 + """ + if not self.stream: return None + return self.stream.read(self.CHUNK) + def closeStream(self): """ 关闭系统音频输出流 diff --git a/docs/TODO.md b/docs/TODO.md index f82621f..511a829 100644 --- a/docs/TODO.md +++ b/docs/TODO.md @@ -1,6 +1,16 @@ +## 已完成 + - [x] 添加英语和日语语言支持 *2025/07/04* - [x] 添加暗色主题 *2025/07/04* - [x] 优化长字幕显示效果 *2025/07/05* - [x] 修复字幕引擎空置报错的问题 *2025/07/05* -- [ ] 添加更多字幕引擎 -- [ ] 减小软件体积 + +## 待完成 + +- [ ] 添加复制字幕到剪贴板功能 +- [ ] 添加本地字幕引擎 +- [ ] 减小软件不必要的体积 + +## 遥远的未来 + +- [ ] 使用 Tauri 框架重新开发 diff --git a/docs/api-docs/electron-ipc.md b/docs/api-docs/electron-ipc.md index 9de6727..0dd9fc3 100644 --- a/docs/api-docs/electron-ipc.md +++ b/docs/api-docs/electron-ipc.md @@ -20,11 +20,11 @@ ### `both.window.mounted` -**介绍:**前端窗口挂载完毕,请求最新的配置数据 +**介绍:** 前端窗口挂载完毕,请求最新的配置数据 -**发起方:**前端 +**发起方:** 前端 -**接收方:**后端 +**接收方:** 后端 **数据类型:** @@ -33,11 +33,11 @@ ### `control.nativeTheme.get` -**介绍:**前端获取系统当前的主题 +**介绍:** 前端获取系统当前的主题 -**发起方:**前端控制窗口 +**发起方:** 前端控制窗口 -**接收方:**后端控制窗口实例 +**接收方:** 后端控制窗口实例 **数据类型:** @@ -48,242 +48,242 @@ ### `control.uiLanguage.change` -**介绍:**前端修改字界面语言,将修改同步给后端 +**介绍:** 前端修改字界面语言,将修改同步给后端 -**发起方:**前端控制窗口 +**发起方:** 前端控制窗口 -**接收方:**后端控制窗口实例 +**接收方:** 后端控制窗口实例 -**数据类型:**`UILanguage` +**数据类型:** `UILanguage` ### `control.uiTheme.change` -**介绍:**前端修改字界面主题,将修改同步给后端 +**介绍:** 前端修改字界面主题,将修改同步给后端 -**发起方:**前端控制窗口 +**发起方:** 前端控制窗口 -**接收方:**后端控制窗口实例 +**接收方:** 后端控制窗口实例 -**数据类型:**`UITheme` +**数据类型:** `UITheme` ### `control.leftBarWidth.change` -**介绍:**前端修改边栏宽度,将修改同步给后端 +**介绍:** 前端修改边栏宽度,将修改同步给后端 -**发起方:**前端控制窗口 +**发起方:** 前端控制窗口 -**接收方:**后端控制窗口实例 +**接收方:** 后端控制窗口实例 -**数据类型:**`number` +**数据类型:** `number` ### `control.captionLog.clear` -**介绍:**清空字幕记录 +**介绍:** 清空字幕记录 -**发起方:**前端控制窗口 +**发起方:** 前端控制窗口 -**接收方:**后端控制窗口实例 +**接收方:** 后端控制窗口实例 -**数据类型:**无数据 +**数据类型:** 无数据 ### `control.styles.change` -**介绍:**前端修改字幕样式,将修改同步给后端 +**介绍:** 前端修改字幕样式,将修改同步给后端 -**发起方:**前端控制窗口 +**发起方:** 前端控制窗口 -**接收方:**后端控制窗口实例 +**接收方:** 后端控制窗口实例 -**数据类型:**`Styles` +**数据类型:** `Styles` ### `control.styles.reset` -**介绍:**将字幕样式恢复为默认 +**介绍:** 将字幕样式恢复为默认 -**发起方:**前端控制窗口 +**发起方:** 前端控制窗口 -**接收方:**后端控制窗口实例 +**接收方:** 后端控制窗口实例 -**数据类型:**无数据 +**数据类型:** 无数据 ### `control.controls.change` -**介绍:**前端修改了字幕引擎配置,将最新配置发送给后端 +**介绍:** 前端修改了字幕引擎配置,将最新配置发送给后端 -**发起方:**前端控制窗口 +**发起方:** 前端控制窗口 -**接收方:**后端控制窗口实例 +**接收方:** 后端控制窗口实例 -**数据类型:**`Controls` +**数据类型:** `Controls` ### `control.captionWindow.activate` -**介绍:**激活字幕窗口 +**介绍:** 激活字幕窗口 -**发起方:**前端控制窗口 +**发起方:** 前端控制窗口 -**接收方:**后端控制窗口实例 +**接收方:** 后端控制窗口实例 -**数据类型:**无数据 +**数据类型:** 无数据 ### `control.engine.start` -**介绍:**启动字幕引擎 +**介绍:** 启动字幕引擎 -**发起方:**前端控制窗口 +**发起方:** 前端控制窗口 -**接收方:**后端控制窗口实例 +**接收方:** 后端控制窗口实例 -**数据类型:**无数据 +**数据类型:** 无数据 ### `control.engine.stop` -**介绍:**关闭字幕引擎 +**介绍:** 关闭字幕引擎 -**发起方:**前端控制窗口 +**发起方:** 前端控制窗口 -**接收方:**后端控制窗口实例 +**接收方:** 后端控制窗口实例 -**数据类型:**无数据 +**数据类型:** 无数据 ### `caption.windowHeight.change` -**介绍:**字幕窗口宽度发生改变 +**介绍:** 字幕窗口宽度发生改变 -**发起方:**前端字幕窗口 +**发起方:** 前端字幕窗口 -**接收方:**后端字幕窗口实例 +**接收方:** 后端字幕窗口实例 -**数据类型:**`number` +**数据类型:** `number` ### `caption.pin.set` -**介绍:**是否将窗口置顶 +**介绍:** 是否将窗口置顶 -**发起方:**前端字幕窗口 +**发起方:** 前端字幕窗口 -**接收方:**后端字幕窗口实例 +**接收方:** 后端字幕窗口实例 -**数据类型:**`boolean` +**数据类型:** `boolean` ### `caption.controlWindow.activate` -**介绍:**激活控制窗口 +**介绍:** 激活控制窗口 -**发起方:**前端字幕窗口 +**发起方:** 前端字幕窗口 -**接收方:**后端字幕窗口实例 +**接收方:** 后端字幕窗口实例 -**数据类型:**无数据 +**数据类型:** 无数据 ### `caption.window.close` -**介绍:**关闭字幕窗口 +**介绍:** 关闭字幕窗口 -**发起方:**前端字幕窗口 +**发起方:** 前端字幕窗口 -**接收方:**后端字幕窗口实例 +**接收方:** 后端字幕窗口实例 -**数据类型:**无数据 +**数据类型:** 无数据 ## 后端 ==> 前端 ### `control.uiLanguage.set` -**介绍:**后端将最新界面语言发送给前端,前端进行设置 +**介绍:** 后端将最新界面语言发送给前端,前端进行设置 -**发起方:**后端 +**发起方:** 后端 -**接收方:**字幕窗口 +**接收方:** 字幕窗口 -**数据类型:**`UILanguage` +**数据类型:** `UILanguage` ### `control.nativeTheme.change` -**介绍:**系统主题发生改变 +**介绍:** 系统主题发生改变 -**发起方:**后端 +**发起方:** 后端 -**接收方:**前端控制窗口 +**接收方:** 前端控制窗口 -**数据类型:**`string` +**数据类型:** `string` ### `control.engine.started` -**介绍:**引擎启动成功 +**介绍:** 引擎启动成功 -**发起方:**后端 +**发起方:** 后端 -**接收方:**前端控制窗口 +**接收方:** 前端控制窗口 -**数据类型:**无数据 +**数据类型:** 无数据 ### `control.engine.stopped` -**介绍:**引擎关闭 +**介绍:** 引擎关闭 -**发起方:**后端 +**发起方:** 后端 -**接收方:**前端控制窗口 +**接收方:** 前端控制窗口 -**数据类型:**无数据 +**数据类型:** 无数据 ### `control.error.occurred` -**介绍:**发送错误 +**介绍:** 发送错误 -**发起方:**后端 +**发起方:** 后端 -**接收方:**前端控制窗口 +**接收方:** 前端控制窗口 -**数据类型:**`string` +**数据类型:** `string` ### `control.controls.set` -**介绍:**后端将最新字幕引擎配置发送给前端,前端进行设置 +**介绍:** 后端将最新字幕引擎配置发送给前端,前端进行设置 -**发起方:**后端 +**发起方:** 后端 -**接收方:**前端控制窗口 +**接收方:** 前端控制窗口 -**数据类型:**`Controls` +**数据类型:** `Controls` ### `both.styles.set` -**介绍:**后端将最新字幕样式发送给前端,前端进行设置 +**介绍:** 后端将最新字幕样式发送给前端,前端进行设置 -**发起方:**后端 +**发起方:** 后端 -**接收方:**前端 +**接收方:** 前端 -**数据类型:**`Styles` +**数据类型:** `Styles` ### `both.captionLog.add` -**介绍:**添加一条新的字幕数据 +**介绍:** 添加一条新的字幕数据 -**发起方:**后端 +**发起方:** 后端 -**接收方:**前端 +**接收方:** 前端 -**数据类型:**`CaptionItem` +**数据类型:** `CaptionItem` ### `both.captionLog.upd` -**介绍:**更新最后一条字幕数据 +**介绍:** 更新最后一条字幕数据 -**发起方:**后端 +**发起方:** 后端 -**接收方:**前端 +**接收方:** 前端 -**数据类型:**`CaptionItem` +**数据类型:** `CaptionItem` ### `both.captionLog.set` -**介绍:**设置全部的字幕数据 +**介绍:** 设置全部的字幕数据 -**发起方:**后端 +**发起方:** 后端 -**接收方:**前端 +**接收方:** 前端 -**数据类型:**`CaptionItem[]` +**数据类型:** `CaptionItem[]` diff --git a/engine-test/gummy.ipynb b/engine-test/gummy.ipynb new file mode 100644 index 0000000..501fd83 --- /dev/null +++ b/engine-test/gummy.ipynb @@ -0,0 +1,221 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from dashscope.audio.asr import *\n", + "import pyaudiowpatch as pyaudio\n", + "import numpy as np\n", + "\n", + "\n", + "def getDefaultSpeakers(mic: pyaudio.PyAudio, info = True):\n", + " \"\"\"\n", + " 获取默认的系统音频输出的回环设备\n", + " Args:\n", + " mic (pyaudio.PyAudio): pyaudio对象\n", + " info (bool, optional): 是否打印设备信息. Defaults to True.\n", + "\n", + " Returns:\n", + " dict: 统音频输出的回环设备\n", + " \"\"\"\n", + " try:\n", + " WASAPI_info = mic.get_host_api_info_by_type(pyaudio.paWASAPI)\n", + " except OSError:\n", + " print(\"Looks like WASAPI is not available on the system. Exiting...\")\n", + " exit()\n", + "\n", + " default_speaker = mic.get_device_info_by_index(WASAPI_info[\"defaultOutputDevice\"])\n", + " if(info): print(\"wasapi_info:\\n\", WASAPI_info, \"\\n\")\n", + " if(info): print(\"default_speaker:\\n\", default_speaker, \"\\n\")\n", + "\n", + " if not default_speaker[\"isLoopbackDevice\"]:\n", + " for loopback in mic.get_loopback_device_info_generator():\n", + " if default_speaker[\"name\"] in loopback[\"name\"]:\n", + " default_speaker = loopback\n", + " if(info): print(\"Using loopback device:\\n\", default_speaker, \"\\n\")\n", + " break\n", + " else:\n", + " print(\"Default loopback output device not found.\")\n", + " print(\"Run `python -m pyaudiowpatch` to check available devices.\")\n", + " print(\"Exiting...\")\n", + " exit()\n", + " \n", + " if(info): print(f\"Recording Device: #{default_speaker['index']} {default_speaker['name']}\")\n", + " return default_speaker\n", + "\n", + "\n", + "class Callback(TranslationRecognizerCallback):\n", + " \"\"\"\n", + " 语音大模型流式传输回调对象\n", + " \"\"\"\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.usage = 0\n", + " self.sentences = []\n", + " self.translations = []\n", + " \n", + " def on_open(self) -> None:\n", + " print(\"\\n流式翻译开始...\\n\")\n", + "\n", + " def on_close(self) -> None:\n", + " print(f\"\\nTokens消耗:{self.usage}\")\n", + " print(f\"流式翻译结束...\\n\")\n", + " for i in range(len(self.sentences)):\n", + " print(f\"\\n{self.sentences[i]}\\n{self.translations[i]}\\n\")\n", + "\n", + " def on_event(\n", + " self,\n", + " request_id,\n", + " transcription_result: TranscriptionResult,\n", + " translation_result: TranslationResult,\n", + " usage\n", + " ) -> None:\n", + " if transcription_result is not None:\n", + " id = transcription_result.sentence_id\n", + " text = transcription_result.text\n", + " if transcription_result.stash is not None:\n", + " stash = transcription_result.stash.text\n", + " else:\n", + " stash = \"\"\n", + " print(f\"#{id}: {text}{stash}\")\n", + " if usage: self.sentences.append(text)\n", + " \n", + " if translation_result is not None:\n", + " lang = translation_result.get_language_list()[0]\n", + " text = translation_result.get_translation(lang).text\n", + " if translation_result.get_translation(lang).stash is not None:\n", + " stash = translation_result.get_translation(lang).stash.text\n", + " else:\n", + " stash = \"\"\n", + " print(f\"#{lang}: {text}{stash}\")\n", + " if usage: self.translations.append(text)\n", + " \n", + " if usage: self.usage += usage['duration']" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "采样输入设备:\n", + " - 序号:26\n", + " - 名称:耳机 (HUAWEI FreeLace 活力版) [Loopback]\n", + " - 最大输入通道数:2\n", + " - 默认低输入延迟:0.003s\n", + " - 默认高输入延迟:0.01s\n", + " - 默认采样率:48000.0Hz\n", + " - 是否回环设备:True\n", + "\n", + "音频样本块大小:4800\n", + "样本位宽:2\n", + "音频数据格式:8\n", + "音频通道数:2\n", + "音频采样率:48000\n", + "\n" + ] + } + ], + "source": [ + "mic = pyaudio.PyAudio()\n", + "default_speaker = getDefaultSpeakers(mic, False)\n", + "\n", + "SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)\n", + "FORMAT = pyaudio.paInt16\n", + "CHANNELS = default_speaker[\"maxInputChannels\"]\n", + "RATE = int(default_speaker[\"defaultSampleRate\"])\n", + "CHUNK = RATE // 10\n", + "INDEX = default_speaker[\"index\"]\n", + "\n", + "dev_info = f\"\"\"\n", + "采样输入设备:\n", + " - 序号:{default_speaker['index']}\n", + " - 名称:{default_speaker['name']}\n", + " - 最大输入通道数:{default_speaker['maxInputChannels']}\n", + " - 默认低输入延迟:{default_speaker['defaultLowInputLatency']}s\n", + " - 默认高输入延迟:{default_speaker['defaultHighInputLatency']}s\n", + " - 默认采样率:{default_speaker['defaultSampleRate']}Hz\n", + " - 是否回环设备:{default_speaker['isLoopbackDevice']}\n", + "\n", + "音频样本块大小:{CHUNK}\n", + "样本位宽:{SAMP_WIDTH}\n", + "音频数据格式:{FORMAT}\n", + "音频通道数:{CHANNELS}\n", + "音频采样率:{RATE}\n", + "\"\"\"\n", + "print(dev_info)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "RECORD_SECONDS = 20 # 监听时长(s)\n", + "\n", + "stream = mic.open(\n", + " format = FORMAT,\n", + " channels = CHANNELS,\n", + " rate = RATE,\n", + " input = True,\n", + " input_device_index = INDEX\n", + ")\n", + "translator = TranslationRecognizerRealtime(\n", + " model = \"gummy-realtime-v1\",\n", + " format = \"pcm\",\n", + " sample_rate = RATE,\n", + " transcription_enabled = True,\n", + " translation_enabled = True,\n", + " source_language = \"ja\",\n", + " translation_target_languages = [\"zh\"],\n", + " callback = Callback()\n", + ")\n", + "translator.start()\n", + "\n", + "for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):\n", + " data = stream.read(CHUNK)\n", + " data_np = np.frombuffer(data, dtype=np.int16)\n", + " data_np_r = data_np.reshape(-1, CHANNELS)\n", + " print(data_np_r.shape)\n", + " mono_data = np.mean(data_np_r.astype(np.float32), axis=1)\n", + " mono_data = mono_data.astype(np.int16)\n", + " mono_data_bytes = mono_data.tobytes()\n", + " translator.send_audio_frame(mono_data_bytes)\n", + "\n", + "translator.stop()\n", + "stream.stop_stream()\n", + "stream.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mystd", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/engine-test/process.ipynb b/engine-test/process.ipynb new file mode 100644 index 0000000..51c3309 --- /dev/null +++ b/engine-test/process.ipynb @@ -0,0 +1,82 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "id": "1e12f3ef", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " 采样设备:\n", + " - 设备类型:音频输入\n", + " - 序号:20\n", + " - 名称:扬声器 (Realtek(R) Audio) [Loopback]\n", + " - 最大输入通道数:2\n", + " - 默认低输入延迟:0.003s\n", + " - 默认高输入延迟:0.01s\n", + " - 默认采样率:48000.0Hz\n", + " - 是否回环设备:True\n", + "\n", + " 音频样本块大小:2400\n", + " 样本位宽:2\n", + " 音频数据格式:8\n", + " 音频通道数:2\n", + " 音频采样率:48000\n", + " \n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "\n", + "current_dir = os.getcwd() \n", + "sys.path.append(os.path.join(current_dir, '../caption-engine'))\n", + "\n", + "from sysaudio.win import AudioStream\n", + "\n", + "stream = AudioStream()\n", + "stream.printInfo()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4c8ad80", + "metadata": {}, + "outputs": [], + "source": [ + "stream.openStream()\n", + "SEC = 2\n", + "for i in range(SEC * 20):\n", + " data = stream.stream.read(stream.CHUNK) # type: ignore\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mystd", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}