From 0696651f0457dae600fc75cad95bb7523b2d02b4 Mon Sep 17 00:00:00 2001 From: himeditator Date: Mon, 7 Jul 2025 00:47:36 +0800 Subject: [PATCH] =?UTF-8?q?feat(audio):=20=E9=87=8D=E6=9E=84=E9=9F=B3?= =?UTF-8?q?=E9=A2=91=E5=A4=84=E7=90=86=E6=A8=A1=E5=9D=97=E3=80=81=E9=9F=B3?= =?UTF-8?q?=E9=A2=91=E6=B5=81=E9=87=8D=E9=87=87=E6=A0=B7=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E6=88=90=E5=8A=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + caption-engine/audioprcs/__init__.py | 2 +- caption-engine/audioprcs/process.py | 49 +++++++ caption-engine/audioprcs/streamchnl.py | 22 --- caption-engine/main-gummy.py | 10 +- caption-engine/requirements.txt | 1 + caption-engine/sysaudio/linux.py | 2 +- caption-engine/sysaudio/win.py | 4 +- engine-test/gummy.ipynb | 4 +- engine-test/process.ipynb | 82 ----------- engine-test/resample.ipynb | 191 +++++++++++++++++++++++++ 11 files changed, 253 insertions(+), 115 deletions(-) create mode 100644 caption-engine/audioprcs/process.py delete mode 100644 caption-engine/audioprcs/streamchnl.py delete mode 100644 engine-test/process.ipynb create mode 100644 engine-test/resample.ipynb diff --git a/.gitignore b/.gitignore index 60312fd..d4eaf7e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ out __pycache__ subenv caption-engine/build +output.wav diff --git a/caption-engine/audioprcs/__init__.py b/caption-engine/audioprcs/__init__.py index 422603b..b63d8eb 100644 --- a/caption-engine/audioprcs/__init__.py +++ b/caption-engine/audioprcs/__init__.py @@ -1 +1 @@ -from .streamchnl import mergeStreamChannels +from .process import mergeChunkChannels, resampleRawChunk diff --git a/caption-engine/audioprcs/process.py b/caption-engine/audioprcs/process.py new file mode 100644 index 0000000..ff9c61a --- /dev/null +++ b/caption-engine/audioprcs/process.py @@ -0,0 +1,49 @@ +import samplerate +import numpy as np + +def mergeChunkChannels(chunk, channels): + """ + 将当前多通道音频数据块转换为单通道音频数据块 + + Args: + chunk: (bytes)多通道音频数据块 + channels: 通道数 + + Returns: + (bytes)单通道音频数据块 + """ + # (length * channels,) + chunk_np = np.frombuffer(chunk, dtype=np.int16) + # (length, channels) + chunk_np = chunk_np.reshape(-1, channels) + # (length,) + chunk_mono_f = np.mean(chunk_np.astype(np.float32), axis=1) + chunk_mono = np.round(chunk_mono_f).astype(np.int16) + return chunk_mono.tobytes() + + +def resampleRawChunk(chunk, channels, orig_sr, target_sr, mode="sinc_best"): + """ + 将当前多通道音频数据块转换成单通道音频数据块,然后进行重采样 + + Args: + chunk: (bytes)多通道音频数据块 + channels: 通道数 + orig_sr: 原始采样率 + target_sr: 目标采样率 + mode: 重采样模式,可选:'sinc_best' | 'sinc_medium' | 'sinc_fastest' | 'zero_order_hold' | 'linear' + + Return: + (bytes)单通道音频数据块 + """ + # (length * channels,) + chunk_np = np.frombuffer(chunk, dtype=np.int16) + # (length, channels) + chunk_np = chunk_np.reshape(-1, channels) + # (length,) + chunk_mono_f = np.mean(chunk_np.astype(np.float32), axis=1) + chunk_mono = chunk_mono_f.astype(np.int16) + ratio = target_sr / orig_sr + chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode) + chunk_mono_r = np.round(chunk_mono_r).astype(np.int16) + return chunk_mono_r.tobytes() diff --git a/caption-engine/audioprcs/streamchnl.py b/caption-engine/audioprcs/streamchnl.py deleted file mode 100644 index 08f3184..0000000 --- a/caption-engine/audioprcs/streamchnl.py +++ /dev/null @@ -1,22 +0,0 @@ -import numpy as np - -def mergeStreamChannels(data, channels): - """ - 将当前多通道流数据合并为单通道流数据 - - Args: - data: 多通道数据 - channels: 通道数 - - Returns: - mono_data_bytes: 单通道数据 - """ - # (length * channels,) - data_np = np.frombuffer(data, dtype=np.int16) - # (length, channels) - data_np_r = data_np.reshape(-1, channels) - # (length,) - mono_data = np.mean(data_np_r.astype(np.float32), axis=1) - mono_data = mono_data.astype(np.int16) - mono_data_bytes = mono_data.tobytes() - return mono_data_bytes diff --git a/caption-engine/main-gummy.py b/caption-engine/main-gummy.py index 6b3cbd6..e3981fb 100644 --- a/caption-engine/main-gummy.py +++ b/caption-engine/main-gummy.py @@ -8,7 +8,7 @@ elif sys.platform == 'linux': else: raise NotImplementedError(f"Unsupported platform: {sys.platform}") -from audioprcs import mergeStreamChannels +from audioprcs import mergeChunkChannels from audio2text import InvalidParameter, GummyTranslator @@ -26,13 +26,13 @@ def convert_audio_to_text(s_lang, t_lang, audio_type): while True: try: - data = stream.read_chunk() - data = mergeStreamChannels(data, stream.CHANNELS) + chunk = stream.read_chunk() + chunk_mono = mergeChunkChannels(chunk, stream.CHANNELS) try: - gummy.send_audio_frame(data) + gummy.send_audio_frame(chunk_mono) except InvalidParameter: gummy.start() - gummy.send_audio_frame(data) + gummy.send_audio_frame(chunk_mono) except KeyboardInterrupt: stream.closeStream() gummy.stop() diff --git a/caption-engine/requirements.txt b/caption-engine/requirements.txt index eefe30b..3dc686b 100644 --- a/caption-engine/requirements.txt +++ b/caption-engine/requirements.txt @@ -1,5 +1,6 @@ dashscope==1.23.5 numpy==2.2.6 +samplerate==0.2.1 PyAudio==0.2.14 PyAudioWPatch==0.2.12.7 # Windows only pyinstaller==6.14.1 diff --git a/caption-engine/sysaudio/linux.py b/caption-engine/sysaudio/linux.py index 74ac8a0..92361a0 100644 --- a/caption-engine/sysaudio/linux.py +++ b/caption-engine/sysaudio/linux.py @@ -34,7 +34,7 @@ class AudioStream: 音频样本块大小:{self.CHUNK} 样本位宽:{self.SAMP_WIDTH} - 音频数据格式:{self.FORMAT} + 采样格式:{self.FORMAT} 音频通道数:{self.CHANNELS} 音频采样率:{self.RATE} """ diff --git a/caption-engine/sysaudio/win.py b/caption-engine/sysaudio/win.py index e5fd6b5..fbf17b9 100644 --- a/caption-engine/sysaudio/win.py +++ b/caption-engine/sysaudio/win.py @@ -65,7 +65,7 @@ class AudioStream: def printInfo(self): dev_info = f""" 采样设备: - - 设备类型:{ "音频输入" if self.audio_type == 0 else "音频输出" } + - 设备类型:{ "音频输出" if self.audio_type == 0 else "音频输入" } - 序号:{self.device['index']} - 名称:{self.device['name']} - 最大输入通道数:{self.device['maxInputChannels']} @@ -76,7 +76,7 @@ class AudioStream: 音频样本块大小:{self.CHUNK} 样本位宽:{self.SAMP_WIDTH} - 音频数据格式:{self.FORMAT} + 采样格式:{self.FORMAT} 音频通道数:{self.CHANNELS} 音频采样率:{self.RATE} """ diff --git a/engine-test/gummy.ipynb b/engine-test/gummy.ipynb index 501fd83..f761f0b 100644 --- a/engine-test/gummy.ipynb +++ b/engine-test/gummy.ipynb @@ -2,11 +2,11 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from dashscope.audio.asr import *\n", + "from dashscope.audio.asr import * # type: ignore\n", "import pyaudiowpatch as pyaudio\n", "import numpy as np\n", "\n", diff --git a/engine-test/process.ipynb b/engine-test/process.ipynb deleted file mode 100644 index 51c3309..0000000 --- a/engine-test/process.ipynb +++ /dev/null @@ -1,82 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 6, - "id": "1e12f3ef", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " 采样设备:\n", - " - 设备类型:音频输入\n", - " - 序号:20\n", - " - 名称:扬声器 (Realtek(R) Audio) [Loopback]\n", - " - 最大输入通道数:2\n", - " - 默认低输入延迟:0.003s\n", - " - 默认高输入延迟:0.01s\n", - " - 默认采样率:48000.0Hz\n", - " - 是否回环设备:True\n", - "\n", - " 音频样本块大小:2400\n", - " 样本位宽:2\n", - " 音频数据格式:8\n", - " 音频通道数:2\n", - " 音频采样率:48000\n", - " \n" - ] - } - ], - "source": [ - "import sys\n", - "import os\n", - "\n", - "current_dir = os.getcwd() \n", - "sys.path.append(os.path.join(current_dir, '../caption-engine'))\n", - "\n", - "from sysaudio.win import AudioStream\n", - "\n", - "stream = AudioStream()\n", - "stream.printInfo()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d4c8ad80", - "metadata": {}, - "outputs": [], - "source": [ - "stream.openStream()\n", - "SEC = 2\n", - "for i in range(SEC * 20):\n", - " data = stream.stream.read(stream.CHUNK) # type: ignore\n", - " " - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "mystd", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/engine-test/resample.ipynb b/engine-test/resample.ipynb new file mode 100644 index 0000000..798a925 --- /dev/null +++ b/engine-test/resample.ipynb @@ -0,0 +1,191 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "1e12f3ef", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " 采样设备:\n", + " - 设备类型:音频输出\n", + " - 序号:26\n", + " - 名称:耳机 (HUAWEI FreeLace 活力版) [Loopback]\n", + " - 最大输入通道数:2\n", + " - 默认低输入延迟:0.003s\n", + " - 默认高输入延迟:0.01s\n", + " - 默认采样率:48000.0Hz\n", + " - 是否回环设备:True\n", + "\n", + " 音频样本块大小:2400\n", + " 样本位宽:2\n", + " 采样格式:8\n", + " 音频通道数:2\n", + " 音频采样率:48000\n", + " \n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "import pyaudio\n", + "import wave\n", + "\n", + "current_dir = os.getcwd() \n", + "sys.path.append(os.path.join(current_dir, '../caption-engine'))\n", + "\n", + "from sysaudio.win import AudioStream\n", + "from audioprcs import resampleRawChunk, mergeChunkChannels\n", + "\n", + "stream = AudioStream(0)\n", + "stream.printInfo()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a72914f4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Recording...\n", + "Done\n" + ] + } + ], + "source": [ + "\"\"\"获取系统音频输出5秒,然后保存为wav文件\"\"\"\n", + "\n", + "with wave.open('output.wav', 'wb') as wf:\n", + " wf.setnchannels(stream.CHANNELS)\n", + " wf.setsampwidth(stream.SAMP_WIDTH)\n", + " wf.setframerate(stream.RATE)\n", + " stream.openStream()\n", + "\n", + " print('Recording...')\n", + "\n", + " for _ in range(0, 100):\n", + " chunk = stream.read_chunk()\n", + " if isinstance(chunk, bytes):\n", + " wf.writeframes(chunk)\n", + " else:\n", + " raise Exception('Error: chunk is not bytes')\n", + " \n", + " stream.closeStream() \n", + " print('Done')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6e8a098", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Recording...\n", + "Done\n" + ] + } + ], + "source": [ + "\"\"\"获取系统音频输入,转换为单通道音频,持续5秒,然后保存为wav文件\"\"\"\n", + "\n", + "with wave.open('output.wav', 'wb') as wf:\n", + " wf.setnchannels(1)\n", + " wf.setsampwidth(stream.SAMP_WIDTH)\n", + " wf.setframerate(stream.RATE)\n", + " stream.openStream()\n", + "\n", + " print('Recording...')\n", + "\n", + " for _ in range(0, 100):\n", + " chunk = mergeChunkChannels(\n", + " stream.read_chunk(),\n", + " stream.CHANNELS\n", + " )\n", + " if isinstance(chunk, bytes):\n", + " wf.writeframes(chunk)\n", + " else:\n", + " raise Exception('Error: chunk is not bytes')\n", + " \n", + " stream.closeStream() \n", + " print('Done')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "aaca1465", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Recording...\n", + "Done\n" + ] + } + ], + "source": [ + "\"\"\"获取系统音频输入,转换为单通道音频并重采样到16000Hz,持续5秒,然后保存为wav文件\"\"\"\n", + "\n", + "with wave.open('output.wav', 'wb') as wf:\n", + " wf.setnchannels(1)\n", + " wf.setsampwidth(stream.SAMP_WIDTH)\n", + " wf.setframerate(16000)\n", + " stream.openStream()\n", + "\n", + " print('Recording...')\n", + "\n", + " for _ in range(0, 100):\n", + " chunk = resampleRawChunk(\n", + " stream.read_chunk(),\n", + " stream.CHANNELS,\n", + " stream.RATE,\n", + " 16000,\n", + " mode=\"sinc_best\"\n", + " )\n", + " if isinstance(chunk, bytes):\n", + " wf.writeframes(chunk)\n", + " else:\n", + " raise Exception('Error: chunk is not bytes')\n", + " \n", + " stream.closeStream() \n", + " print('Done')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mystd", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}