mirror of
https://github.com/HiMeditator/auto-caption.git
synced 2026-02-04 04:14:42 +08:00
- 重构 GummyTranslator 类,增加启动和停止方法 - 优化 AudioStream 类,添加读取音频数据方法 - 更新 main-gummy.py,使用新的 GummyTranslator 和 AudioStream 接口 - 更新文档和 TODO 列表
222 lines
7.7 KiB
Plaintext
222 lines
7.7 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from dashscope.audio.asr import *\n",
|
||
"import pyaudiowpatch as pyaudio\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"\n",
|
||
"def getDefaultSpeakers(mic: pyaudio.PyAudio, info = True):\n",
|
||
" \"\"\"\n",
|
||
" 获取默认的系统音频输出的回环设备\n",
|
||
" Args:\n",
|
||
" mic (pyaudio.PyAudio): pyaudio对象\n",
|
||
" info (bool, optional): 是否打印设备信息. Defaults to True.\n",
|
||
"\n",
|
||
" Returns:\n",
|
||
" dict: 系统音频输出的回环设备\n",
|
||
" \"\"\"\n",
|
||
" try:\n",
|
||
" WASAPI_info = mic.get_host_api_info_by_type(pyaudio.paWASAPI)\n",
|
||
" except OSError:\n",
|
||
" print(\"Looks like WASAPI is not available on the system. Exiting...\")\n",
|
||
" exit()\n",
|
||
"\n",
|
||
" default_speaker = mic.get_device_info_by_index(WASAPI_info[\"defaultOutputDevice\"])\n",
|
||
" if(info): print(\"wasapi_info:\\n\", WASAPI_info, \"\\n\")\n",
|
||
" if(info): print(\"default_speaker:\\n\", default_speaker, \"\\n\")\n",
|
||
"\n",
|
||
" if not default_speaker[\"isLoopbackDevice\"]:\n",
|
||
" for loopback in mic.get_loopback_device_info_generator():\n",
|
||
" if default_speaker[\"name\"] in loopback[\"name\"]:\n",
|
||
" default_speaker = loopback\n",
|
||
" if(info): print(\"Using loopback device:\\n\", default_speaker, \"\\n\")\n",
|
||
" break\n",
|
||
" else:\n",
|
||
" print(\"Default loopback output device not found.\")\n",
|
||
" print(\"Run `python -m pyaudiowpatch` to check available devices.\")\n",
|
||
" print(\"Exiting...\")\n",
|
||
" exit()\n",
|
||
" \n",
|
||
" if(info): print(f\"Recording Device: #{default_speaker['index']} {default_speaker['name']}\")\n",
|
||
" return default_speaker\n",
|
||
"\n",
|
||
"\n",
|
||
"class Callback(TranslationRecognizerCallback):\n",
|
||
" \"\"\"\n",
|
||
" 语音大模型流式传输回调对象\n",
|
||
" \"\"\"\n",
|
||
" def __init__(self):\n",
|
||
" super().__init__()\n",
|
||
" self.usage = 0\n",
|
||
" self.sentences = []\n",
|
||
" self.translations = []\n",
|
||
" \n",
|
||
" def on_open(self) -> None:\n",
|
||
" print(\"\\n流式翻译开始...\\n\")\n",
|
||
"\n",
|
||
" def on_close(self) -> None:\n",
|
||
" print(f\"\\nTokens消耗:{self.usage}\")\n",
|
||
" print(f\"流式翻译结束...\\n\")\n",
|
||
" for i in range(len(self.sentences)):\n",
|
||
" print(f\"\\n{self.sentences[i]}\\n{self.translations[i]}\\n\")\n",
|
||
"\n",
|
||
" def on_event(\n",
|
||
" self,\n",
|
||
" request_id,\n",
|
||
" transcription_result: TranscriptionResult,\n",
|
||
" translation_result: TranslationResult,\n",
|
||
" usage\n",
|
||
" ) -> None:\n",
|
||
" if transcription_result is not None:\n",
|
||
" id = transcription_result.sentence_id\n",
|
||
" text = transcription_result.text\n",
|
||
" if transcription_result.stash is not None:\n",
|
||
" stash = transcription_result.stash.text\n",
|
||
" else:\n",
|
||
" stash = \"\"\n",
|
||
" print(f\"#{id}: {text}{stash}\")\n",
|
||
" if usage: self.sentences.append(text)\n",
|
||
" \n",
|
||
" if translation_result is not None:\n",
|
||
" lang = translation_result.get_language_list()[0]\n",
|
||
" text = translation_result.get_translation(lang).text\n",
|
||
" if translation_result.get_translation(lang).stash is not None:\n",
|
||
" stash = translation_result.get_translation(lang).stash.text\n",
|
||
" else:\n",
|
||
" stash = \"\"\n",
|
||
" print(f\"#{lang}: {text}{stash}\")\n",
|
||
" if usage: self.translations.append(text)\n",
|
||
" \n",
|
||
" if usage: self.usage += usage['duration']"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"采样输入设备:\n",
|
||
" - 序号:26\n",
|
||
" - 名称:耳机 (HUAWEI FreeLace 活力版) [Loopback]\n",
|
||
" - 最大输入通道数:2\n",
|
||
" - 默认低输入延迟:0.003s\n",
|
||
" - 默认高输入延迟:0.01s\n",
|
||
" - 默认采样率:48000.0Hz\n",
|
||
" - 是否回环设备:True\n",
|
||
"\n",
|
||
"音频样本块大小:4800\n",
|
||
"样本位宽:2\n",
|
||
"音频数据格式:8\n",
|
||
"音频通道数:2\n",
|
||
"音频采样率:48000\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"mic = pyaudio.PyAudio()\n",
|
||
"default_speaker = getDefaultSpeakers(mic, False)\n",
|
||
"\n",
|
||
"SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)\n",
|
||
"FORMAT = pyaudio.paInt16\n",
|
||
"CHANNELS = default_speaker[\"maxInputChannels\"]\n",
|
||
"RATE = int(default_speaker[\"defaultSampleRate\"])\n",
|
||
"CHUNK = RATE // 10\n",
|
||
"INDEX = default_speaker[\"index\"]\n",
|
||
"\n",
|
||
"dev_info = f\"\"\"\n",
|
||
"采样输入设备:\n",
|
||
" - 序号:{default_speaker['index']}\n",
|
||
" - 名称:{default_speaker['name']}\n",
|
||
" - 最大输入通道数:{default_speaker['maxInputChannels']}\n",
|
||
" - 默认低输入延迟:{default_speaker['defaultLowInputLatency']}s\n",
|
||
" - 默认高输入延迟:{default_speaker['defaultHighInputLatency']}s\n",
|
||
" - 默认采样率:{default_speaker['defaultSampleRate']}Hz\n",
|
||
" - 是否回环设备:{default_speaker['isLoopbackDevice']}\n",
|
||
"\n",
|
||
"音频样本块大小:{CHUNK}\n",
|
||
"样本位宽:{SAMP_WIDTH}\n",
|
||
"音频数据格式:{FORMAT}\n",
|
||
"音频通道数:{CHANNELS}\n",
|
||
"音频采样率:{RATE}\n",
|
||
"\"\"\"\n",
|
||
"print(dev_info)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"RECORD_SECONDS = 20 # 监听时长(s)\n",
|
||
"\n",
|
||
"stream = mic.open(\n",
|
||
" format = FORMAT,\n",
|
||
" channels = CHANNELS,\n",
|
||
" rate = RATE,\n",
|
||
" input = True,\n",
|
||
" input_device_index = INDEX\n",
|
||
")\n",
|
||
"translator = TranslationRecognizerRealtime(\n",
|
||
" model = \"gummy-realtime-v1\",\n",
|
||
" format = \"pcm\",\n",
|
||
" sample_rate = RATE,\n",
|
||
" transcription_enabled = True,\n",
|
||
" translation_enabled = True,\n",
|
||
" source_language = \"ja\",\n",
|
||
" translation_target_languages = [\"zh\"],\n",
|
||
" callback = Callback()\n",
|
||
")\n",
|
||
"translator.start()\n",
|
||
"\n",
|
||
"for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):\n",
|
||
" data = stream.read(CHUNK)\n",
|
||
" data_np = np.frombuffer(data, dtype=np.int16)\n",
|
||
" data_np_r = data_np.reshape(-1, CHANNELS)\n",
|
||
" print(data_np_r.shape)\n",
|
||
" mono_data = np.mean(data_np_r.astype(np.float32), axis=1)\n",
|
||
" mono_data = mono_data.astype(np.int16)\n",
|
||
" mono_data_bytes = mono_data.tobytes()\n",
|
||
" translator.send_audio_frame(mono_data_bytes)\n",
|
||
"\n",
|
||
"translator.stop()\n",
|
||
"stream.stop_stream()\n",
|
||
"stream.close()"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "mystd",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.12"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|