mirror of
https://github.com/HiMeditator/auto-caption.git
synced 2026-02-11 01:54:46 +08:00
feat: 增加了音频输入选项,并优化了字幕引擎的构建和运行流程。
- 新增了系统音频输入(麦克风)的选择功能 - 重构了字幕引擎的构建流程,使用 PyInstaller 打包为可执行文件 - 优化了字幕引擎的启动和停止逻辑 - 更新了用户界面,增加了音频选择的控制选项 - 修改了相关的文件路径和构建配置
This commit is contained in:
48
python-subprocess/main-gummy.py
Normal file
48
python-subprocess/main-gummy.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import sys
|
||||
|
||||
if sys.platform == 'win32':
|
||||
from sysaudio.win import AudioStream, mergeStreamChannels
|
||||
elif sys.platform == 'linux':
|
||||
from sysaudio.linux import AudioStream, mergeStreamChannels
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported platform: {sys.platform}")
|
||||
|
||||
from audio2text.gummy import GummyTranslator
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
def convert_audio_to_text(s_lang, t_lang, audio_type):
|
||||
sys.stdout.reconfigure(line_buffering=True)
|
||||
stream = AudioStream(audio_type)
|
||||
stream.openStream()
|
||||
|
||||
if t_lang == 'none':
|
||||
gummy = GummyTranslator(stream.RATE, s_lang, None)
|
||||
else:
|
||||
gummy = GummyTranslator(stream.RATE, s_lang, t_lang)
|
||||
gummy.translator.start()
|
||||
|
||||
while True:
|
||||
try:
|
||||
if not stream.stream: continue
|
||||
data = stream.stream.read(stream.CHUNK)
|
||||
data = mergeStreamChannels(data, stream.CHANNELS)
|
||||
gummy.translator.send_audio_frame(data)
|
||||
except KeyboardInterrupt:
|
||||
stream.closeStream()
|
||||
gummy.translator.stop()
|
||||
break
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Convert system audio stream to text')
|
||||
parser.add_argument('-s', '--source_language', default='en', help='Source language code')
|
||||
parser.add_argument('-t', '--target_language', default='zh', help='Target language code')
|
||||
parser.add_argument('-a', '--audio_type', default='0', help='Audio stream source: 0 for output audio stream, 1 for input audio stream')
|
||||
args = parser.parse_args()
|
||||
convert_audio_to_text(
|
||||
args.source_language,
|
||||
args.target_language,
|
||||
0 if args.audio_type == '0' else 1
|
||||
)
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
|
||||
a = Analysis(
|
||||
['main.py'],
|
||||
['main-gummy.py'],
|
||||
pathex=[],
|
||||
binaries=[],
|
||||
datas=[],
|
||||
@@ -22,7 +22,7 @@ exe = EXE(
|
||||
a.binaries,
|
||||
a.datas,
|
||||
[],
|
||||
name='main',
|
||||
name='main-gummy',
|
||||
debug=False,
|
||||
bootloader_ignore_signals=False,
|
||||
strip=False,
|
||||
@@ -1,27 +0,0 @@
|
||||
from sysaudio.win import LoopbackStream, mergeStreamChannels
|
||||
from audio2text.gummy import GummyTranslator
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
def convert_audio_to_text(s_lang, t_lang, audio_source):
|
||||
sys.stdout.reconfigure(line_buffering=True)
|
||||
loopback = LoopbackStream()
|
||||
loopback.openStream()
|
||||
|
||||
gummy = GummyTranslator(loopback.RATE, s_lang, t_lang)
|
||||
gummy.translator.start()
|
||||
|
||||
while True:
|
||||
if not loopback.stream: continue
|
||||
data = loopback.stream.read(loopback.CHUNK)
|
||||
data = mergeStreamChannels(data, loopback.CHANNELS)
|
||||
gummy.translator.send_audio_frame(data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Convert system audio stream to text')
|
||||
parser.add_argument('-s', '--s_lang', default='en', help='Source language code')
|
||||
parser.add_argument('-t', '--t_lang', default='zh', help='Target language code')
|
||||
parser.add_argument('-a', '--audio', default=0, help='Audio stream source: 0 for output audio stream, 1 for input audio stream')
|
||||
args = parser.parse_args()
|
||||
convert_audio_to_text(args.s_lang, args.t_lang, args.audio)
|
||||
Binary file not shown.
79
python-subprocess/sysaudio/linux.py
Normal file
79
python-subprocess/sysaudio/linux.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import pyaudio
|
||||
import numpy as np
|
||||
|
||||
def mergeStreamChannels(data, channels):
|
||||
"""
|
||||
将当前多通道流数据合并为单通道流数据
|
||||
|
||||
Args:
|
||||
data: 多通道数据
|
||||
channels: 通道数
|
||||
|
||||
Returns:
|
||||
mono_data_bytes: 单通道数据
|
||||
"""
|
||||
# (length * channels,)
|
||||
data_np = np.frombuffer(data, dtype=np.int16)
|
||||
# (length, channels)
|
||||
data_np_r = data_np.reshape(-1, channels)
|
||||
# (length,)
|
||||
mono_data = np.mean(data_np_r.astype(np.float32), axis=1)
|
||||
mono_data = mono_data.astype(np.int16)
|
||||
mono_data_bytes = mono_data.tobytes()
|
||||
return mono_data_bytes
|
||||
|
||||
|
||||
class AudioStream:
|
||||
def __init__(self, audio_type=1):
|
||||
self.audio_type = audio_type
|
||||
self.mic = pyaudio.PyAudio()
|
||||
self.device = self.mic.get_default_input_device_info()
|
||||
self.stream = None
|
||||
self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)
|
||||
self.FORMAT = pyaudio.paInt16
|
||||
self.CHANNELS = self.device["maxInputChannels"]
|
||||
self.RATE = int(self.device["defaultSampleRate"])
|
||||
self.CHUNK = self.RATE // 20
|
||||
self.INDEX = self.device["index"]
|
||||
|
||||
def printInfo(self):
|
||||
dev_info = f"""
|
||||
采样输入设备:
|
||||
- 设备类型:{ "音频输入(Linux平台目前仅支持该项)" }
|
||||
- 序号:{self.device['index']}
|
||||
- 名称:{self.device['name']}
|
||||
- 最大输入通道数:{self.device['maxInputChannels']}
|
||||
- 默认低输入延迟:{self.device['defaultLowInputLatency']}s
|
||||
- 默认高输入延迟:{self.device['defaultHighInputLatency']}s
|
||||
- 默认采样率:{self.device['defaultSampleRate']}Hz
|
||||
|
||||
音频样本块大小:{self.CHUNK}
|
||||
样本位宽:{self.SAMP_WIDTH}
|
||||
音频数据格式:{self.FORMAT}
|
||||
音频通道数:{self.CHANNELS}
|
||||
音频采样率:{self.RATE}
|
||||
"""
|
||||
print(dev_info)
|
||||
|
||||
def openStream(self):
|
||||
"""
|
||||
打开并返回系统音频输出流
|
||||
"""
|
||||
if self.stream: return self.stream
|
||||
self.stream = self.mic.open(
|
||||
format = self.FORMAT,
|
||||
channels = self.CHANNELS,
|
||||
rate = self.RATE,
|
||||
input = True,
|
||||
input_device_index = self.INDEX
|
||||
)
|
||||
return self.stream
|
||||
|
||||
def closeStream(self):
|
||||
"""
|
||||
关闭系统音频输出流
|
||||
"""
|
||||
if self.stream is None: return
|
||||
self.stream.stop_stream()
|
||||
self.stream.close()
|
||||
self.stream = None
|
||||
@@ -61,28 +61,39 @@ def mergeStreamChannels(data, channels):
|
||||
mono_data_bytes = mono_data.tobytes()
|
||||
return mono_data_bytes
|
||||
|
||||
class LoopbackStream:
|
||||
def __init__(self):
|
||||
class AudioStream:
|
||||
"""
|
||||
获取系统音频流
|
||||
|
||||
参数:
|
||||
audio_type: (默认)0-系统音频输出流,1-系统音频输入流
|
||||
"""
|
||||
def __init__(self, audio_type=0):
|
||||
self.audio_type = audio_type
|
||||
self.mic = pyaudio.PyAudio()
|
||||
self.loopback = getDefaultLoopbackDevice(self.mic, False)
|
||||
if self.audio_type == 0:
|
||||
self.device = getDefaultLoopbackDevice(self.mic, False)
|
||||
else:
|
||||
self.device = self.mic.get_default_input_device_info()
|
||||
self.stream = None
|
||||
self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)
|
||||
self.FORMAT = pyaudio.paInt16
|
||||
self.CHANNELS = self.loopback["maxInputChannels"]
|
||||
self.RATE = int(self.loopback["defaultSampleRate"])
|
||||
self.CHANNELS = self.device["maxInputChannels"]
|
||||
self.RATE = int(self.device["defaultSampleRate"])
|
||||
self.CHUNK = self.RATE // 20
|
||||
self.INDEX = self.loopback["index"]
|
||||
self.INDEX = self.device["index"]
|
||||
|
||||
def printInfo(self):
|
||||
dev_info = f"""
|
||||
采样输入设备:
|
||||
- 序号:{self.loopback['index']}
|
||||
- 名称:{self.loopback['name']}
|
||||
- 最大输入通道数:{self.loopback['maxInputChannels']}
|
||||
- 默认低输入延迟:{self.loopback['defaultLowInputLatency']}s
|
||||
- 默认高输入延迟:{self.loopback['defaultHighInputLatency']}s
|
||||
- 默认采样率:{self.loopback['defaultSampleRate']}Hz
|
||||
- 是否回环设备:{self.loopback['isLoopbackDevice']}
|
||||
采样设备:
|
||||
- 设备类型:{ "音频输入" if self.audio_type == 0 else "音频输出" }
|
||||
- 序号:{self.device['index']}
|
||||
- 名称:{self.device['name']}
|
||||
- 最大输入通道数:{self.device['maxInputChannels']}
|
||||
- 默认低输入延迟:{self.device['defaultLowInputLatency']}s
|
||||
- 默认高输入延迟:{self.device['defaultHighInputLatency']}s
|
||||
- 默认采样率:{self.device['defaultSampleRate']}Hz
|
||||
- 是否回环设备:{self.device['isLoopbackDevice']}
|
||||
|
||||
音频样本块大小:{self.CHUNK}
|
||||
样本位宽:{self.SAMP_WIDTH}
|
||||
|
||||
Reference in New Issue
Block a user