mirror of
https://github.com/HiMeditator/auto-caption.git
synced 2026-02-15 04:14:46 +08:00
feat(audio): 重构音频处理模块、音频流重采样测试成功
This commit is contained in:
@@ -1 +1 @@
|
||||
from .streamchnl import mergeStreamChannels
|
||||
from .process import mergeChunkChannels, resampleRawChunk
|
||||
|
||||
49
caption-engine/audioprcs/process.py
Normal file
49
caption-engine/audioprcs/process.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import samplerate
|
||||
import numpy as np
|
||||
|
||||
def mergeChunkChannels(chunk, channels):
|
||||
"""
|
||||
将当前多通道音频数据块转换为单通道音频数据块
|
||||
|
||||
Args:
|
||||
chunk: (bytes)多通道音频数据块
|
||||
channels: 通道数
|
||||
|
||||
Returns:
|
||||
(bytes)单通道音频数据块
|
||||
"""
|
||||
# (length * channels,)
|
||||
chunk_np = np.frombuffer(chunk, dtype=np.int16)
|
||||
# (length, channels)
|
||||
chunk_np = chunk_np.reshape(-1, channels)
|
||||
# (length,)
|
||||
chunk_mono_f = np.mean(chunk_np.astype(np.float32), axis=1)
|
||||
chunk_mono = np.round(chunk_mono_f).astype(np.int16)
|
||||
return chunk_mono.tobytes()
|
||||
|
||||
|
||||
def resampleRawChunk(chunk, channels, orig_sr, target_sr, mode="sinc_best"):
|
||||
"""
|
||||
将当前多通道音频数据块转换成单通道音频数据块,然后进行重采样
|
||||
|
||||
Args:
|
||||
chunk: (bytes)多通道音频数据块
|
||||
channels: 通道数
|
||||
orig_sr: 原始采样率
|
||||
target_sr: 目标采样率
|
||||
mode: 重采样模式,可选:'sinc_best' | 'sinc_medium' | 'sinc_fastest' | 'zero_order_hold' | 'linear'
|
||||
|
||||
Return:
|
||||
(bytes)单通道音频数据块
|
||||
"""
|
||||
# (length * channels,)
|
||||
chunk_np = np.frombuffer(chunk, dtype=np.int16)
|
||||
# (length, channels)
|
||||
chunk_np = chunk_np.reshape(-1, channels)
|
||||
# (length,)
|
||||
chunk_mono_f = np.mean(chunk_np.astype(np.float32), axis=1)
|
||||
chunk_mono = chunk_mono_f.astype(np.int16)
|
||||
ratio = target_sr / orig_sr
|
||||
chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
|
||||
chunk_mono_r = np.round(chunk_mono_r).astype(np.int16)
|
||||
return chunk_mono_r.tobytes()
|
||||
@@ -1,22 +0,0 @@
|
||||
import numpy as np
|
||||
|
||||
def mergeStreamChannels(data, channels):
|
||||
"""
|
||||
将当前多通道流数据合并为单通道流数据
|
||||
|
||||
Args:
|
||||
data: 多通道数据
|
||||
channels: 通道数
|
||||
|
||||
Returns:
|
||||
mono_data_bytes: 单通道数据
|
||||
"""
|
||||
# (length * channels,)
|
||||
data_np = np.frombuffer(data, dtype=np.int16)
|
||||
# (length, channels)
|
||||
data_np_r = data_np.reshape(-1, channels)
|
||||
# (length,)
|
||||
mono_data = np.mean(data_np_r.astype(np.float32), axis=1)
|
||||
mono_data = mono_data.astype(np.int16)
|
||||
mono_data_bytes = mono_data.tobytes()
|
||||
return mono_data_bytes
|
||||
@@ -8,7 +8,7 @@ elif sys.platform == 'linux':
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported platform: {sys.platform}")
|
||||
|
||||
from audioprcs import mergeStreamChannels
|
||||
from audioprcs import mergeChunkChannels
|
||||
from audio2text import InvalidParameter, GummyTranslator
|
||||
|
||||
|
||||
@@ -26,13 +26,13 @@ def convert_audio_to_text(s_lang, t_lang, audio_type):
|
||||
|
||||
while True:
|
||||
try:
|
||||
data = stream.read_chunk()
|
||||
data = mergeStreamChannels(data, stream.CHANNELS)
|
||||
chunk = stream.read_chunk()
|
||||
chunk_mono = mergeChunkChannels(chunk, stream.CHANNELS)
|
||||
try:
|
||||
gummy.send_audio_frame(data)
|
||||
gummy.send_audio_frame(chunk_mono)
|
||||
except InvalidParameter:
|
||||
gummy.start()
|
||||
gummy.send_audio_frame(data)
|
||||
gummy.send_audio_frame(chunk_mono)
|
||||
except KeyboardInterrupt:
|
||||
stream.closeStream()
|
||||
gummy.stop()
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
dashscope==1.23.5
|
||||
numpy==2.2.6
|
||||
samplerate==0.2.1
|
||||
PyAudio==0.2.14
|
||||
PyAudioWPatch==0.2.12.7 # Windows only
|
||||
pyinstaller==6.14.1
|
||||
|
||||
@@ -34,7 +34,7 @@ class AudioStream:
|
||||
|
||||
音频样本块大小:{self.CHUNK}
|
||||
样本位宽:{self.SAMP_WIDTH}
|
||||
音频数据格式:{self.FORMAT}
|
||||
采样格式:{self.FORMAT}
|
||||
音频通道数:{self.CHANNELS}
|
||||
音频采样率:{self.RATE}
|
||||
"""
|
||||
|
||||
@@ -65,7 +65,7 @@ class AudioStream:
|
||||
def printInfo(self):
|
||||
dev_info = f"""
|
||||
采样设备:
|
||||
- 设备类型:{ "音频输入" if self.audio_type == 0 else "音频输出" }
|
||||
- 设备类型:{ "音频输出" if self.audio_type == 0 else "音频输入" }
|
||||
- 序号:{self.device['index']}
|
||||
- 名称:{self.device['name']}
|
||||
- 最大输入通道数:{self.device['maxInputChannels']}
|
||||
@@ -76,7 +76,7 @@ class AudioStream:
|
||||
|
||||
音频样本块大小:{self.CHUNK}
|
||||
样本位宽:{self.SAMP_WIDTH}
|
||||
音频数据格式:{self.FORMAT}
|
||||
采样格式:{self.FORMAT}
|
||||
音频通道数:{self.CHANNELS}
|
||||
音频采样率:{self.RATE}
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user