mirror of
https://github.com/HiMeditator/auto-caption.git
synced 2026-04-08 13:19:39 +08:00
feat(python-subprocess): 尝试字幕显示新解决方案
- 使用 python 子进程解析字幕 - 通过 websocket 通信将字幕传递给软件
This commit is contained in:
@@ -2,7 +2,11 @@
|
||||
|
||||
拟实现功能:
|
||||
|
||||
- [ ] 可以获取 Windows 系统音频流
|
||||
- [x] 可以获取 Windows 系统音频流
|
||||
- [ ] 可以对音频流进行转换(调整声道数和采样率)
|
||||
- [ ] 可以获取 Linux 系统视频流
|
||||
- [ ] 添加字幕图形界面
|
||||
- [ ] 界面中可以实时显示当前系统音频对应的字幕
|
||||
- [ ] 字幕显示
|
||||
- [ ] 字幕样式设置
|
||||
- [ ] 字幕页面删除标题栏
|
||||
- [ ] 界面中实时显示当前系统音频对应的字幕
|
||||
|
||||
223
python-subprocess/audio2caption.py
Normal file
223
python-subprocess/audio2caption.py
Normal file
@@ -0,0 +1,223 @@
|
||||
import pyaudiowpatch as pyaudio
|
||||
import numpy as np
|
||||
import tkinter as tk
|
||||
from tkinter import ttk
|
||||
from dashscope.audio.asr import (
|
||||
TranslationRecognizerCallback,
|
||||
TranslationRecognizerRealtime
|
||||
)
|
||||
import threading
|
||||
import queue
|
||||
|
||||
class AudioCapture:
    """Capture the Windows system audio output via a WASAPI loopback device.

    Device discovery happens once in ``__init__``; the stream itself can be
    opened and closed repeatedly with ``start_stream``/``stop_stream``.
    """

    def __init__(self):
        # The PyAudio instance stays alive for the object's lifetime so the
        # stream can be re-opened after stop_stream().
        self.audio = pyaudio.PyAudio()
        self.stream = None
        self.is_running = False
        self.setup_audio()

    def setup_audio(self):
        """Locate the default loopback output device and derive stream params.

        Raises:
            Exception: if WASAPI is unavailable or no loopback twin of the
                default output device can be found.
        """
        try:
            wasapi_info = self.audio.get_host_api_info_by_type(pyaudio.paWASAPI)
        except OSError:
            raise Exception("WASAPI 不可用")

        default_speaker = self.audio.get_device_info_by_index(wasapi_info["defaultOutputDevice"])

        # The default output device usually is not itself a loopback device;
        # search for the loopback device whose name contains the speaker name.
        if not default_speaker["isLoopbackDevice"]:
            for loopback in self.audio.get_loopback_device_info_generator():
                if default_speaker["name"] in loopback["name"]:
                    default_speaker = loopback
                    break
            else:
                raise Exception("未找到默认回环输出设备")

        self.device_info = default_speaker
        self.channels = default_speaker["maxInputChannels"]
        self.rate = int(default_speaker["defaultSampleRate"])
        # One chunk is 100 ms worth of frames.
        self.chunk = self.rate // 10

    def start_stream(self):
        """Open the loopback input stream; no-op if it is already open."""
        if self.stream is not None:
            return
        self.stream = self.audio.open(
            format=pyaudio.paInt16,
            channels=self.channels,
            rate=self.rate,
            input=True,
            input_device_index=self.device_info["index"]
        )
        self.is_running = True

    def stop_stream(self):
        """Stop and close the stream, keeping the object reusable.

        BUG FIX: this method previously also called ``self.audio.terminate()``,
        which destroyed the PyAudio instance and made any subsequent
        ``start_stream()`` fail (start -> stop -> start crashed). Termination
        now lives in the separate ``terminate()`` method.
        """
        if self.stream:
            self.is_running = False
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None  # allow a later start_stream()

    def terminate(self):
        """Release the underlying PyAudio instance (object unusable after)."""
        self.stop_stream()
        self.audio.terminate()
|
||||
|
||||
class CaptionCallback(TranslationRecognizerCallback):
    """Recognizer callback that forwards events onto a queue.

    Each event is published as a ``(kind, text)`` tuple, where kind is one of
    ``"status"``, ``"caption"`` or ``"translation"``.
    """

    def __init__(self, text_queue):
        super().__init__()
        self.text_queue = text_queue
        self.usage = 0  # accumulated token duration reported by the service

    def on_open(self) -> None:
        self.text_queue.put(("status", "开始识别..."))

    def on_close(self) -> None:
        self.text_queue.put(("status", f"识别结束,消耗 Tokens: {self.usage}"))

    def on_event(self, request_id, transcription_result, translation_result, usage) -> None:
        """Publish transcription and translation text, appending any stash
        (not-yet-final tail) the service provides."""
        if transcription_result is not None:
            parts = [transcription_result.text]
            if transcription_result.stash is not None:
                parts.append(transcription_result.stash.text)
            self.text_queue.put(("caption", "".join(parts)))

        if translation_result is not None:
            lang = translation_result.get_language_list()[0]
            translation = translation_result.get_translation(lang)
            parts = [translation.text]
            if translation.stash is not None:
                parts.append(translation.stash.text)
            self.text_queue.put(("translation", "".join(parts)))

        if usage:
            self.usage += usage['duration']
|
||||
|
||||
class CaptionApp:
    """Tkinter window showing realtime captions/translations of system audio.

    Audio is captured from the default loopback device on a worker thread,
    down-mixed to mono and streamed to the DashScope Gummy realtime
    translator. Recognizer events arrive on a thread-safe queue and are
    rendered on the Tk main thread.
    """

    def __init__(self):
        self.root = tk.Tk()
        self.root.title("实时字幕")
        self.root.geometry("800x400")

        self.setup_ui()
        self.text_queue = queue.Queue()
        self.audio_capture = AudioCapture()
        self.translator = None
        self.is_running = False
        # Caches holding the latest two caption / translation lines.
        self.caption_cache = []
        self.translation_cache = []

    def setup_ui(self):
        """Build the status label, two caption/translation rows and buttons."""
        # Status label
        self.status_label = ttk.Label(self.root, text="就绪")
        self.status_label.pack(pady=5)

        # Caption display area
        self.caption_frame = ttk.Frame(self.root)
        self.caption_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)

        # Two label pairs: row 1 shows the newest line, row 2 the previous one.
        self.caption_label1 = ttk.Label(self.caption_frame, text="", font=("Arial", 14))
        self.caption_label1.pack(fill=tk.X, pady=5)
        self.translation_label1 = ttk.Label(self.caption_frame, text="", font=("Arial", 14))
        self.translation_label1.pack(fill=tk.X, pady=5)

        self.caption_label2 = ttk.Label(self.caption_frame, text="", font=("Arial", 14))
        self.caption_label2.pack(fill=tk.X, pady=5)
        self.translation_label2 = ttk.Label(self.caption_frame, text="", font=("Arial", 14))
        self.translation_label2.pack(fill=tk.X, pady=5)

        # Control buttons
        self.control_frame = ttk.Frame(self.root)
        self.control_frame.pack(pady=10)

        self.start_button = ttk.Button(self.control_frame, text="开始", command=self.start_caption)
        self.start_button.pack(side=tk.LEFT, padx=5)

        self.stop_button = ttk.Button(self.control_frame, text="停止", command=self.stop_caption, state=tk.DISABLED)
        self.stop_button.pack(side=tk.LEFT, padx=5)

    def start_caption(self):
        """Start audio capture, the recognizer, and the UI poll loop."""
        self.is_running = True
        self.start_button.config(state=tk.DISABLED)
        self.stop_button.config(state=tk.NORMAL)

        # Initialize the realtime translator (Japanese -> Chinese).
        self.translator = TranslationRecognizerRealtime(
            model="gummy-realtime-v1",
            format="pcm",
            sample_rate=self.audio_capture.rate,
            transcription_enabled=True,
            translation_enabled=True,
            source_language="ja",
            translation_target_languages=["zh"],
            callback=CaptionCallback(self.text_queue)
        )

        # Start audio capture and translation.
        self.audio_capture.start_stream()
        self.translator.start()

        # Audio is pumped on a daemon worker thread. BUG FIX: the UI update
        # loop previously ran on its own thread and mutated widgets from it;
        # Tkinter widgets are not thread-safe, so the queue is now polled on
        # the Tk main thread via root.after instead.
        threading.Thread(target=self.process_audio, daemon=True).start()
        self.root.after(50, self.update_ui)

    def stop_caption(self):
        """Stop the recognizer and the audio stream; reset the buttons."""
        self.is_running = False
        self.start_button.config(state=tk.NORMAL)
        self.stop_button.config(state=tk.DISABLED)

        if self.translator:
            self.translator.stop()
        self.audio_capture.stop_stream()

    def process_audio(self):
        """Worker thread: read chunks, down-mix to mono, feed the translator."""
        while self.is_running:
            try:
                data = self.audio_capture.stream.read(self.audio_capture.chunk)
                data_np = np.frombuffer(data, dtype=np.int16)
                # (frames, channels) -> per-frame mean -> mono int16 bytes.
                data_np_r = data_np.reshape(-1, self.audio_capture.channels)
                mono_data = np.mean(data_np_r.astype(np.float32), axis=1)
                mono_data = mono_data.astype(np.int16)
                self.translator.send_audio_frame(mono_data.tobytes())
            except Exception as e:
                # Reading from a just-closed stream raises; only surface
                # errors that happen while capture is supposed to be live.
                if self.is_running:
                    self.text_queue.put(("error", str(e)))
                break

    def update_caption_display(self):
        """Render the cached caption/translation lines into the labels."""
        # Caption rows: newest line in row 1, previous line in row 2.
        if len(self.caption_cache) > 0:
            self.caption_label1.config(text=self.caption_cache[-1])
        if len(self.caption_cache) > 1:
            self.caption_label2.config(text=self.caption_cache[-2])
        else:
            self.caption_label2.config(text="")

        # Translation rows mirror the caption rows.
        if len(self.translation_cache) > 0:
            self.translation_label1.config(text=f"翻译: {self.translation_cache[-1]}")
        if len(self.translation_cache) > 1:
            self.translation_label2.config(text=f"翻译: {self.translation_cache[-2]}")
        else:
            self.translation_label2.config(text="")

    def update_ui(self):
        """Drain the event queue on the Tk main thread, then re-arm."""
        while True:
            try:
                msg_type, text = self.text_queue.get_nowait()
            except queue.Empty:
                break
            if msg_type == "status":
                self.status_label.config(text=text)
            elif msg_type == "caption":
                self.caption_cache.append(text)
                if len(self.caption_cache) > 2:
                    self.caption_cache.pop(0)
                self.update_caption_display()
            elif msg_type == "translation":
                self.translation_cache.append(text)
                if len(self.translation_cache) > 2:
                    self.translation_cache.pop(0)
                self.update_caption_display()
            elif msg_type == "error":
                self.status_label.config(text=f"错误: {text}")
                self.stop_caption()
        if self.is_running:
            self.root.after(50, self.update_ui)

    def run(self):
        """Enter the Tk main loop (blocks until the window is closed)."""
        self.root.mainloop()
|
||||
|
||||
# Script entry point: launch the caption window.
if __name__ == "__main__":
    CaptionApp().run()
|
||||
@@ -1,8 +1,9 @@
|
||||
from dashscope.audio.asr import \
|
||||
TranslationRecognizerCallback, \
|
||||
TranscriptionResult, \
|
||||
TranslationResult, \
|
||||
TranslationRecognizerRealtime
|
||||
from dashscope.audio.asr import (
|
||||
TranslationRecognizerCallback,
|
||||
TranscriptionResult,
|
||||
TranslationResult,
|
||||
TranslationRecognizerRealtime
|
||||
)
|
||||
|
||||
class Callback(TranslationRecognizerCallback):
|
||||
"""
|
||||
@@ -53,15 +54,15 @@ class Callback(TranslationRecognizerCallback):
|
||||
if usage: self.usage += usage['duration']
|
||||
|
||||
|
||||
class GummyTranslator:
    """Thin wrapper around a configured realtime Gummy recognizer.

    Reconstructed from the diff: replaces the removed ``getGummpyTranslator``
    factory with a class whose source/target languages are parameters instead
    of hard-coded "ja"/"zh".

    Args:
        rate: PCM sample rate of the audio that will be fed in.
        source: source language code (e.g. "ja").
        target: translation target language code (e.g. "zh").
    """

    def __init__(self, rate, source, target):
        self.translator = TranslationRecognizerRealtime(
            model="gummy-realtime-v1",
            format="pcm",
            sample_rate=rate,
            transcription_enabled=True,
            translation_enabled=True,
            source_language=source,
            translation_target_languages=[target],
            callback=Callback()
        )
|
||||
@@ -1,41 +1,17 @@
|
||||
"""Demo: stream the system audio output into the Gummy realtime translator.

Reconstructed new-side script from the diff (@@ -1,41 +1,17 @@): the raw
pyaudio/numpy plumbing moved into LoopbackStream / mergeStreamChannels.
"""

from sysaudio.win import LoopbackStream, mergeStreamChannels
from audio2text.gummy import GummyTranslator

loopback = LoopbackStream()
loopback.openStream()

gummy = GummyTranslator(loopback.RATE, "ja", "zh")
gummy.translator.start()

# 100 chunks of 100 ms each, i.e. roughly 10 s of audio.
for i in range(0, 100):
    if not loopback.stream:
        continue
    data = loopback.stream.read(loopback.CHUNK)
    # Down-mix the interleaved channels to mono before sending.
    data = mergeStreamChannels(data, loopback.CHANNELS)
    gummy.translator.send_audio_frame(data)

gummy.translator.stop()
loopback.closeStream()
|
||||
@@ -1,6 +1,8 @@
|
||||
"""获取 Windows 系统音频输出流"""
|
||||
|
||||
import pyaudiowpatch as pyaudio
|
||||
import numpy as np
|
||||
|
||||
|
||||
def getDefaultLoopbackDevice(mic: pyaudio.PyAudio, info = True)->dict:
|
||||
"""
|
||||
@@ -38,16 +40,77 @@ def getDefaultLoopbackDevice(mic: pyaudio.PyAudio, info = True)->dict:
|
||||
return default_speaker
|
||||
|
||||
|
||||
def mergeStreamChannels(data, channels):
    """Down-mix interleaved multi-channel 16-bit PCM data to mono.

    Reconstructed new-side function from the diff (the removed
    ``getOutputStream`` lines were interleaved into this hunk).

    Args:
        data: raw bytes of interleaved int16 samples.
        channels: number of interleaved channels.

    Returns:
        bytes: mono int16 samples, one per input frame.
    """
    # (frames * channels,)
    data_np = np.frombuffer(data, dtype=np.int16)
    # (frames, channels) — raises if len(data) is not a whole number of frames.
    data_np_r = data_np.reshape(-1, channels)
    # Average in float32 to avoid int16 overflow, then truncate back. (frames,)
    mono_data = np.mean(data_np_r.astype(np.float32), axis=1)
    mono_data = mono_data.astype(np.int16)
    mono_data_bytes = mono_data.tobytes()
    return mono_data_bytes
|
||||
|
||||
class LoopbackStream:
    """WASAPI loopback capture of the default output device.

    Wraps a PyAudio instance plus the loopback device info and exposes
    open/close helpers for the capture stream.
    """

    def __init__(self):
        self.mic = pyaudio.PyAudio()
        self.loopback = getDefaultLoopbackDevice(self.mic, False)
        self.stream = None
        dev = self.loopback
        self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = dev["maxInputChannels"]
        self.RATE = int(dev["defaultSampleRate"])
        self.CHUNK = self.RATE // 10  # 100 ms of frames per chunk
        self.INDEX = dev["index"]

    def printInfo(self):
        """Print the capture device's properties (labels kept verbatim)."""
        dev_info = f"""
采样输入设备:
- 序号:{self.loopback['index']}
- 名称:{self.loopback['name']}
- 最大输入通道数:{self.loopback['maxInputChannels']}
- 默认低输入延迟:{self.loopback['defaultLowInputLatency']}s
- 默认高输入延迟:{self.loopback['defaultHighInputLatency']}s
- 默认采样率:{self.loopback['defaultSampleRate']}Hz
- 是否回环设备:{self.loopback['isLoopbackDevice']}

音频样本块大小:{self.CHUNK}
样本位宽:{self.SAMP_WIDTH}
音频数据格式:{self.FORMAT}
音频通道数:{self.CHANNELS}
音频采样率:{self.RATE}
"""
        print(dev_info)

    def openStream(self):
        """Open and return the system audio output stream.

        Returns the existing stream unchanged when one is already open.
        """
        if self.stream is None:
            self.stream = self.mic.open(
                format=self.FORMAT,
                channels=self.CHANNELS,
                rate=self.RATE,
                input=True,
                input_device_index=self.INDEX
            )
        return self.stream

    def closeStream(self):
        """Close the system audio output stream if one is open."""
        if self.stream is not None:
            self.stream.stop_stream()
            self.stream.close()
            self.stream = None
|
||||
Reference in New Issue
Block a user