{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import zip_longest\n",
    "\n",
    "from dashscope.audio.asr import * # type: ignore\n",
    "import pyaudiowpatch as pyaudio\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "def getDefaultSpeakers(mic: pyaudio.PyAudio, info = True):\n",
    "    \"\"\"\n",
    "    Find the loopback device that mirrors the default system audio output.\n",
    "\n",
    "    Args:\n",
    "        mic (pyaudio.PyAudio): PyAudio instance used to query host APIs and devices.\n",
    "        info (bool, optional): Whether to print device details. Defaults to True.\n",
    "\n",
    "    Returns:\n",
    "        dict: Device info of the loopback device for the default output device.\n",
    "\n",
    "    Raises:\n",
    "        RuntimeError: If WASAPI is unavailable, or if no loopback device whose\n",
    "            name contains the default output device's name can be found.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        WASAPI_info = mic.get_host_api_info_by_type(pyaudio.paWASAPI)\n",
    "    except OSError as err:\n",
    "        # Raise instead of calling exit(): a helper should report a catchable\n",
    "        # error rather than take down the whole interpreter / notebook kernel.\n",
    "        raise RuntimeError(\n",
    "            \"Looks like WASAPI is not available on the system.\"\n",
    "        ) from err\n",
    "\n",
    "    default_speaker = mic.get_device_info_by_index(WASAPI_info[\"defaultOutputDevice\"])\n",
    "    if info:\n",
    "        print(\"wasapi_info:\\n\", WASAPI_info, \"\\n\")\n",
    "        print(\"default_speaker:\\n\", default_speaker, \"\\n\")\n",
    "\n",
    "    if not default_speaker[\"isLoopbackDevice\"]:\n",
    "        # The default output is a plain render device; look for the loopback\n",
    "        # device whose name contains the output device's name.\n",
    "        for loopback in mic.get_loopback_device_info_generator():\n",
    "            if default_speaker[\"name\"] in loopback[\"name\"]:\n",
    "                default_speaker = loopback\n",
    "                if info:\n",
    "                    print(\"Using loopback device:\\n\", default_speaker, \"\\n\")\n",
    "                break\n",
    "        else:\n",
    "            # Loop finished without a match.\n",
    "            raise RuntimeError(\n",
    "                \"Default loopback output device not found. \"\n",
    "                \"Run `python -m pyaudiowpatch` to check available devices.\"\n",
    "            )\n",
    "\n",
    "    if info:\n",
    "        print(f\"Recording Device: #{default_speaker['index']} {default_speaker['name']}\")\n",
    "    return default_speaker\n",
    "\n",
    "\n",
    "class Callback(TranslationRecognizerCallback):\n",
    "    \"\"\"\n",
    "    Streaming callback for the speech model: prints live transcription and\n",
    "    translation results and collects the texts reported together with usage.\n",
    "    \"\"\"\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        self.usage = 0           # running sum of usage['duration']\n",
    "        self.sentences = []      # transcription texts received with usage info\n",
    "        self.translations = []   # translation texts received with usage info\n",
    "\n",
    "    def on_open(self) -> None:\n",
    "        \"\"\"Announce that the streaming session has started.\"\"\"\n",
    "        print(\"\\n流式翻译开始...\\n\")\n",
    "\n",
    "    def on_close(self) -> None:\n",
    "        \"\"\"Print the accumulated usage and every collected sentence/translation pair.\"\"\"\n",
    "        print(f\"\\nTokens消耗:{self.usage}\")\n",
    "        print(f\"流式翻译结束...\\n\")\n",
    "        # zip_longest: the two lists may differ in length (e.g. a sentence that\n",
    "        # never received a translation); pad instead of raising IndexError.\n",
    "        for sentence, translation in zip_longest(\n",
    "            self.sentences, self.translations, fillvalue=\"\"\n",
    "        ):\n",
    "            print(f\"\\n{sentence}\\n{translation}\\n\")\n",
    "\n",
    "    def on_event(\n",
    "        self,\n",
    "        request_id,\n",
    "        transcription_result: TranscriptionResult,\n",
    "        translation_result: TranslationResult,\n",
    "        usage\n",
    "    ) -> None:\n",
    "        \"\"\"\n",
    "        Print one streaming result and record it when usage info is attached.\n",
    "\n",
    "        Args:\n",
    "            request_id: Request identifier passed by the SDK (unused here).\n",
    "            transcription_result: Transcription update, or None.\n",
    "            translation_result: Translation update, or None.\n",
    "            usage: Usage info; when truthy its 'duration' entry is accumulated.\n",
    "                Presumably only set once a sentence is final - confirm against the SDK docs.\n",
    "        \"\"\"\n",
    "        if transcription_result is not None:\n",
    "            sentence_id = transcription_result.sentence_id\n",
    "            text = transcription_result.text\n",
    "            stash = transcription_result.stash\n",
    "            stash_text = stash.text if stash is not None else \"\"\n",
    "            print(f\"#{sentence_id}: {text}{stash_text}\")\n",
    "            if usage:\n",
    "                self.sentences.append(text)\n",
    "\n",
    "        if translation_result is not None:\n",
    "            languages = translation_result.get_language_list()\n",
    "            # Only the first target language is reported; skip the update when\n",
    "            # the result carries no language at all.\n",
    "            if languages:\n",
    "                lang = languages[0]\n",
    "                translation = translation_result.get_translation(lang)\n",
    "                text = translation.text\n",
    "                stash_text = translation.stash.text if translation.stash is not None else \"\"\n",
    "                print(f\"#{lang}: {text}{stash_text}\")\n",
    "                if usage:\n",
    "                    self.translations.append(text)\n",
    "\n",
    "        if usage:\n",
    "            self.usage += usage['duration']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "采样输入设备:\n",
      "    - 序号:26\n",
      "    - 名称:耳机 (HUAWEI FreeLace 活力版) [Loopback]\n",
      "    - 最大输入通道数:2\n",
      "    - 默认低输入延迟:0.003s\n",
      "    - 默认高输入延迟:0.01s\n",
      "    - 默认采样率:48000.0Hz\n",
      "    - 是否回环设备:True\n",
      "\n",
      "音频样本块大小:4800\n",
      "样本位宽:2\n",
      "音频数据格式:8\n",
      "音频通道数:2\n",
      "音频采样率:48000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "mic = pyaudio.PyAudio()\n",
    "default_speaker = getDefaultSpeakers(mic, False)\n",
    "\n",
    "# Capture parameters derived from the selected loopback device.\n",
    "FORMAT = pyaudio.paInt16\n",
    "SAMP_WIDTH = pyaudio.get_sample_size(FORMAT)\n",
    "CHANNELS = default_speaker[\"maxInputChannels\"]\n",
    "RATE = int(default_speaker[\"defaultSampleRate\"])\n",
    "CHUNK = RATE // 10  # frames per read: one tenth of a second of audio\n",
    "INDEX = default_speaker[\"index\"]\n",
    "\n",
    "dev_info = f\"\"\"\n",
    "采样输入设备:\n",
    "    - 序号:{default_speaker['index']}\n",
    "    - 名称:{default_speaker['name']}\n",
    "    - 最大输入通道数:{default_speaker['maxInputChannels']}\n",
    "    - 默认低输入延迟:{default_speaker['defaultLowInputLatency']}s\n",
    "    - 默认高输入延迟:{default_speaker['defaultHighInputLatency']}s\n",
    "    - 默认采样率:{default_speaker['defaultSampleRate']}Hz\n",
    "    - 是否回环设备:{default_speaker['isLoopbackDevice']}\n",
    "\n",
    "音频样本块大小:{CHUNK}\n",
    "样本位宽:{SAMP_WIDTH}\n",
    "音频数据格式:{FORMAT}\n",
    "音频通道数:{CHANNELS}\n",
    "音频采样率:{RATE}\n",
    "\"\"\"\n",
    "print(dev_info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "RECORD_SECONDS = 20  # capture duration (s)\n",
    "\n",
    "stream = mic.open(\n",
    "    format = FORMAT,\n",
    "    channels = CHANNELS,\n",
    "    rate = RATE,\n",
    "    input = True,\n",
    "    input_device_index = INDEX\n",
    ")\n",
    "try:\n",
    "    translator = TranslationRecognizerRealtime(\n",
    "        model = \"gummy-realtime-v1\",\n",
    "        format = \"pcm\",\n",
    "        sample_rate = RATE,\n",
    "        transcription_enabled = True,\n",
    "        translation_enabled = True,\n",
    "        source_language = \"ja\",\n",
    "        translation_target_languages = [\"zh\"],\n",
    "        callback = Callback()\n",
    "    )\n",
    "    translator.start()\n",
    "    try:\n",
    "        for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):\n",
    "            data = stream.read(CHUNK)\n",
    "            # Interleaved int16 samples -> (frames, channels).\n",
    "            frames = np.frombuffer(data, dtype=np.int16).reshape(-1, CHANNELS)\n",
    "            # Downmix to mono by averaging the channels in float32, then\n",
    "            # convert back to int16 before sending.\n",
    "            mono_data = frames.astype(np.float32).mean(axis=1).astype(np.int16)\n",
    "            translator.send_audio_frame(mono_data.tobytes())\n",
    "    finally:\n",
    "        # Always end the recognition session, even on error or interrupt.\n",
    "        translator.stop()\n",
    "finally:\n",
    "    # Always release the audio stream, even on error or interrupt.\n",
    "    stream.stop_stream()\n",
    "    stream.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mystd",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}