{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from dashscope.audio.asr import *\n", "import pyaudiowpatch as pyaudio\n", "import numpy as np\n", "\n", "\n", "def getDefaultSpeakers(mic: pyaudio.PyAudio, info = True):\n", " \"\"\"\n", " 获取默认的系统音频输出的回环设备\n", " Args:\n", " mic (pyaudio.PyAudio): pyaudio对象\n", " info (bool, optional): 是否打印设备信息. Defaults to True.\n", "\n", " Returns:\n", " dict: 统音频输出的回环设备\n", " \"\"\"\n", " try:\n", " WASAPI_info = mic.get_host_api_info_by_type(pyaudio.paWASAPI)\n", " except OSError:\n", " print(\"Looks like WASAPI is not available on the system. Exiting...\")\n", " exit()\n", "\n", " default_speaker = mic.get_device_info_by_index(WASAPI_info[\"defaultOutputDevice\"])\n", " if(info): print(\"wasapi_info:\\n\", WASAPI_info, \"\\n\")\n", " if(info): print(\"default_speaker:\\n\", default_speaker, \"\\n\")\n", "\n", " if not default_speaker[\"isLoopbackDevice\"]:\n", " for loopback in mic.get_loopback_device_info_generator():\n", " if default_speaker[\"name\"] in loopback[\"name\"]:\n", " default_speaker = loopback\n", " if(info): print(\"Using loopback device:\\n\", default_speaker, \"\\n\")\n", " break\n", " else:\n", " print(\"Default loopback output device not found.\")\n", " print(\"Run `python -m pyaudiowpatch` to check available devices.\")\n", " print(\"Exiting...\")\n", " exit()\n", " \n", " if(info): print(f\"Recording Device: #{default_speaker['index']} {default_speaker['name']}\")\n", " return default_speaker\n", "\n", "\n", "class Callback(TranslationRecognizerCallback):\n", " \"\"\"\n", " 语音大模型流式传输回调对象\n", " \"\"\"\n", " def __init__(self):\n", " super().__init__()\n", " self.usage = 0\n", " self.sentences = []\n", " self.translations = []\n", " \n", " def on_open(self) -> None:\n", " print(\"\\n流式翻译开始...\\n\")\n", "\n", " def on_close(self) -> None:\n", " print(f\"\\nTokens消耗:{self.usage}\")\n", " print(f\"流式翻译结束...\\n\")\n", " for i in range(len(self.sentences)):\n", " print(f\"\\n{self.sentences[i]}\\n{self.translations[i]}\\n\")\n", "\n", " def on_event(\n", " self,\n", " request_id,\n", " transcription_result: TranscriptionResult,\n", " translation_result: TranslationResult,\n", " usage\n", " ) -> None:\n", " if transcription_result is not None:\n", " id = transcription_result.sentence_id\n", " text = transcription_result.text\n", " if transcription_result.stash is not None:\n", " stash = transcription_result.stash.text\n", " else:\n", " stash = \"\"\n", " print(f\"#{id}: {text}{stash}\")\n", " if usage: self.sentences.append(text)\n", " \n", " if translation_result is not None:\n", " lang = translation_result.get_language_list()[0]\n", " text = translation_result.get_translation(lang).text\n", " if translation_result.get_translation(lang).stash is not None:\n", " stash = translation_result.get_translation(lang).stash.text\n", " else:\n", " stash = \"\"\n", " print(f\"#{lang}: {text}{stash}\")\n", " if usage: self.translations.append(text)\n", " \n", " if usage: self.usage += usage['duration']" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "采样输入设备:\n", " - 序号:37\n", " - 名称:耳机 (HUAWEI FreeLace 活力版) [Loopback]\n", " - 最大输入通道数:2\n", " - 默认低输入延迟:0.003s\n", " - 默认高输入延迟:0.01s\n", " - 默认采样率:44100.0Hz\n", " - 是否回环设备:True\n", "\n", "音频样本块大小:4410\n", "样本位宽:2\n", "音频数据格式:8\n", "音频通道数:2\n", "音频采样率:44100\n", "\n" ] } ], "source": [ "mic = pyaudio.PyAudio()\n", "default_speaker = getDefaultSpeakers(mic, False)\n", "\n", "SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)\n", "FORMAT = pyaudio.paInt16\n", "CHANNELS = default_speaker[\"maxInputChannels\"]\n", "RATE = int(default_speaker[\"defaultSampleRate\"])\n", "CHUNK = RATE // 10\n", "INDEX = default_speaker[\"index\"]\n", "\n", "dev_info = f\"\"\"\n", "采样输入设备:\n", " - 序号:{default_speaker['index']}\n", " - 名称:{default_speaker['name']}\n", " - 最大输入通道数:{default_speaker['maxInputChannels']}\n", " - 默认低输入延迟:{default_speaker['defaultLowInputLatency']}s\n", " - 默认高输入延迟:{default_speaker['defaultHighInputLatency']}s\n", " - 默认采样率:{default_speaker['defaultSampleRate']}Hz\n", " - 是否回环设备:{default_speaker['isLoopbackDevice']}\n", "\n", "音频样本块大小:{CHUNK}\n", "样本位宽:{SAMP_WIDTH}\n", "音频数据格式:{FORMAT}\n", "音频通道数:{CHANNELS}\n", "音频采样率:{RATE}\n", "\"\"\"\n", "print(dev_info)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "流式翻译开始...\n", "\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n", "(4410, 2)\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_29036\\3259296939.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mRATE\u001b[0m \u001b[1;33m/\u001b[0m \u001b[0mCHUNK\u001b[0m \u001b[1;33m*\u001b[0m \u001b[0mRECORD_SECONDS\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 23\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstream\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mCHUNK\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 24\u001b[0m \u001b[0mdata_np\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrombuffer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mint16\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[0mdata_np_r\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdata_np\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mCHANNELS\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32md:\\ML\\anaconda3\\envs\\mystd\\lib\\site-packages\\pyaudiowpatch\\__init__.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, num_frames, exception_on_overflow)\u001b[0m\n\u001b[0;32m 638\u001b[0m paCanNotReadFromAnOutputOnlyStream)\n\u001b[0;32m 639\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 640\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mpa\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_stream\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_stream\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnum_frames\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexception_on_overflow\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 641\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 642\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mget_read_available\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "RECORD_SECONDS = 20 # 监听时长(s)\n", "\n", "stream = mic.open(\n", " format = FORMAT,\n", " channels = CHANNELS,\n", " rate = RATE,\n", " input = True,\n", " input_device_index = INDEX\n", ")\n", "translator = TranslationRecognizerRealtime(\n", " model = \"gummy-realtime-v1\",\n", " format = \"pcm\",\n", " sample_rate = RATE,\n", " transcription_enabled = True,\n", " translation_enabled = True,\n", " source_language = \"ja\",\n", " translation_target_languages = [\"zh\"],\n", " callback = Callback()\n", ")\n", "translator.start()\n", "\n", "for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):\n", " data = stream.read(CHUNK)\n", " data_np = np.frombuffer(data, dtype=np.int16)\n", " data_np_r = data_np.reshape(-1, CHANNELS)\n", " print(data_np_r.shape)\n", " mono_data = np.mean(data_np_r.astype(np.float32), axis=1)\n", " mono_data = mono_data.astype(np.int16)\n", " mono_data_bytes = mono_data.tobytes()\n", " translator.send_audio_frame(mono_data_bytes)\n", "\n", "translator.stop()\n", "stream.stop_stream()\n", "stream.close()" ] } ], "metadata": { "kernelspec": { "display_name": "mystd", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }