From f10530eb6760bba3322983c689b6961ea8ecd568 Mon Sep 17 00:00:00 2001 From: himeditator Date: Fri, 13 Jun 2025 19:54:00 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=96=B9=E6=A1=88=EF=BC=8C?= =?UTF-8?q?=E4=BC=98=E5=85=88=E8=80=83=E8=99=91=20python=20=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + {python => python-prototype}/gummy.ipynb | 0 {python => python-prototype}/requirements.txt | 0 python-subprocess/README.md | 8 +++ python-subprocess/audio2text/gummy.py | 67 +++++++++++++++++++ python-subprocess/main.py | 41 ++++++++++++ python-subprocess/sysaudio/win.py | 53 +++++++++++++++ 7 files changed, 170 insertions(+) rename {python => python-prototype}/gummy.ipynb (100%) rename {python => python-prototype}/requirements.txt (100%) create mode 100644 python-subprocess/README.md create mode 100644 python-subprocess/audio2text/gummy.py create mode 100644 python-subprocess/main.py create mode 100644 python-subprocess/sysaudio/win.py diff --git a/.gitignore b/.gitignore index 75ef73c..2d3a96a 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ out .DS_Store .eslintcache *.log* +__pycache__/ \ No newline at end of file diff --git a/python/gummy.ipynb b/python-prototype/gummy.ipynb similarity index 100% rename from python/gummy.ipynb rename to python-prototype/gummy.ipynb diff --git a/python/requirements.txt b/python-prototype/requirements.txt similarity index 100% rename from python/requirements.txt rename to python-prototype/requirements.txt diff --git a/python-subprocess/README.md b/python-subprocess/README.md new file mode 100644 index 0000000..7d84356 --- /dev/null +++ b/python-subprocess/README.md @@ -0,0 +1,8 @@ +这是项目的 python 实现。使用 Tkinter 创建 GUI。 + +拟实现功能: + +- [ ] 可以获取 Windows 系统音频流 +- [ ] 可以获取 Linux 系统音频流 +- [ ] 添加字幕图形界面 +- [ ] 界面中可以实时显示当前系统音频对应的字幕 diff --git a/python-subprocess/audio2text/gummy.py b/python-subprocess/audio2text/gummy.py new file mode 
100644
index 0000000..8300846
--- /dev/null
+++ b/python-subprocess/audio2text/gummy.py
@@ -0,0 +1,110 @@
+from itertools import zip_longest
+
+from dashscope.audio.asr import \
+    TranslationRecognizerCallback, \
+    TranscriptionResult, \
+    TranslationResult, \
+    TranslationRecognizerRealtime
+
+
+class Callback(TranslationRecognizerCallback):
+    """Streaming callback that prints and collects Gummy results.
+
+    Attributes:
+        usage: Running sum of the ``duration`` values from usage payloads.
+        sentences: Transcribed text of events that carried a usage payload.
+        translations: Translated text of events that carried a usage payload.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.usage = 0
+        self.sentences = []
+        self.translations = []
+
+    def on_open(self) -> None:
+        """Announce that the streaming session has started."""
+        print("\nGummy 流式翻译开始...\n")
+
+    def on_close(self) -> None:
+        """Print the accumulated usage and every collected sentence pair."""
+        print(f"\nTokens消耗:{self.usage}")
+        print("流式翻译结束...\n")
+        # The two lists are filled independently in on_event and may differ
+        # in length, so pad the shorter one instead of indexing past its end.
+        pairs = zip_longest(self.sentences, self.translations, fillvalue="")
+        for sentence, translation in pairs:
+            print(f"\n{sentence}\n{translation}\n")
+
+    def on_event(
+        self,
+        request_id,
+        transcription_result: TranscriptionResult,
+        translation_result: TranslationResult,
+        usage
+    ) -> None:
+        """Print the results carried by one event and record them.
+
+        Every non-None result is printed; its text is stored only when
+        ``usage`` is truthy.
+
+        Args:
+            request_id: Request identifier; not used by this callback.
+            transcription_result: Transcription payload, or None.
+            translation_result: Translation payload, or None.
+            usage: Usage payload; when truthy, its ``duration`` entry is
+                added to ``self.usage``.
+        """
+        if transcription_result is not None:
+            sentence_id = transcription_result.sentence_id
+            text = transcription_result.text
+            stash = transcription_result.stash
+            stash_text = "" if stash is None else stash.text
+            print(f"#{sentence_id}: {text}{stash_text}")
+            if usage:
+                self.sentences.append(text)
+
+        if translation_result is not None:
+            languages = translation_result.get_language_list()
+            # Skip the translation part when no language is reported.
+            if languages:
+                lang = languages[0]
+                translation = translation_result.get_translation(lang)
+                text = translation.text
+                stash = translation.stash
+                stash_text = "" if stash is None else stash.text
+                print(f"#{lang}: {text}{stash_text}")
+                if usage:
+                    self.translations.append(text)
+
+        if usage:
+            self.usage += usage['duration']
+
+
+def getGummpyTranslator(
+    rate,
+    source_language="ja",
+    target_languages=("zh",)
+) -> TranslationRecognizerRealtime:
+    """Create a realtime Gummy recognizer wired to a fresh Callback.
+
+    Args:
+        rate: Sample rate passed to the recognizer as ``sample_rate``.
+        source_language: Language code of the incoming audio.
+        target_languages: Iterable of language codes to translate into.
+
+    Returns:
+        TranslationRecognizerRealtime: The configured recognizer; it is
+        returned without being started.
+    """
+    translator = TranslationRecognizerRealtime(
+        model = "gummy-realtime-v1",
+        format = "pcm",
+        sample_rate = rate,
+        transcription_enabled = True,
+        translation_enabled = True,
+        source_language = source_language,
+        translation_target_languages = list(target_languages),
+        callback = Callback()
+    )
+    return translator
diff --git a/python-subprocess/main.py b/python-subprocess/main.py
new file mode
100644
index 0000000..2c58972
--- /dev/null
+++ b/python-subprocess/main.py
@@ -0,0 +1,65 @@
+from sysaudio.win import getDefaultLoopbackDevice
+from audio2text.gummy import getGummpyTranslator
+import pyaudiowpatch as pyaudio
+import numpy as np
+
+SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)
+FORMAT = pyaudio.paInt16
+RECORD_SECONDS = 20 # capture duration (s)
+
+
+def to_mono(frames, channels):
+    """Average interleaved int16 PCM channels into a single channel.
+
+    Args:
+        frames (bytes): Interleaved 16-bit PCM samples.
+        channels (int): Number of interleaved channels in ``frames``.
+
+    Returns:
+        bytes: 16-bit PCM holding one averaged sample per input frame.
+    """
+    samples = np.frombuffer(frames, dtype=np.int16).reshape(-1, channels)
+    # Average in float32, then convert the result back to int16.
+    mono = np.mean(samples.astype(np.float32), axis=1)
+    return mono.astype(np.int16).tobytes()
+
+
+def main():
+    """Stream RECORD_SECONDS of system audio to the Gummy translator.
+
+    Audio is read from the default loopback device in chunks of
+    ``rate // 10`` frames, downmixed to mono and sent chunk by chunk.
+    The translator, the stream and PyAudio are released even when an
+    error interrupts the capture.
+    """
+    mic = pyaudio.PyAudio()
+    try:
+        loopback = getDefaultLoopbackDevice(mic)
+        channels = loopback["maxInputChannels"]
+        rate = int(loopback["defaultSampleRate"])
+        chunk = rate // 10
+        stream = mic.open(
+            format = FORMAT,
+            channels = channels,
+            rate = rate,
+            input = True,
+            input_device_index = loopback["index"]
+        )
+        try:
+            translator = getGummpyTranslator(rate=rate)
+            translator.start()
+            try:
+                for _ in range(int(rate / chunk * RECORD_SECONDS)):
+                    mono_bytes = to_mono(stream.read(chunk), channels)
+                    translator.send_audio_frame(mono_bytes)
+            finally:
+                translator.stop()
+        finally:
+            stream.stop_stream()
+            stream.close()
+    finally:
+        mic.terminate()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python-subprocess/sysaudio/win.py b/python-subprocess/sysaudio/win.py
new file mode 100644
index 0000000..194bf29
--- /dev/null
+++ b/python-subprocess/sysaudio/win.py
@@ -0,0 +1,72 @@
+"""Access the Windows system audio output stream."""
+
+import sys
+
+import pyaudiowpatch as pyaudio
+
+
+def getDefaultLoopbackDevice(mic: pyaudio.PyAudio, info = True)->dict:
+    """
+    Find the loopback device of the default system audio output.
+
+    Args:
+        mic (pyaudio.PyAudio): PyAudio instance used to query devices
+        info (bool, optional): whether to print device information
+
+    Returns:
+        dict: device info of the loopback device for the default output
+
+    Raises:
+        SystemExit: with status 1 when WASAPI is unavailable or no
+            loopback device matches the default output device
+    """
+    try:
+        WASAPI_info = mic.get_host_api_info_by_type(pyaudio.paWASAPI)
+    except OSError:
+        print("Looks like WASAPI is not available on the system. Exiting...")
+        # exit() comes from the site module and may be missing; sys.exit is
+        # always available, and status 1 reports the failure to the caller.
+        sys.exit(1)
+
+    default_speaker = mic.get_device_info_by_index(WASAPI_info["defaultOutputDevice"])
+    if info:
+        print("wasapi_info:\n", WASAPI_info, "\n")
+        print("default_speaker:\n", default_speaker, "\n")
+
+    if not default_speaker["isLoopbackDevice"]:
+        for loopback in mic.get_loopback_device_info_generator():
+            if default_speaker["name"] in loopback["name"]:
+                default_speaker = loopback
+                if info:
+                    print("Using loopback device:\n", default_speaker, "\n")
+                break
+        else:
+            print("Default loopback output device not found.")
+            print("Run `python -m pyaudiowpatch` to check available devices.")
+            print("Exiting...")
+            sys.exit(1)
+
+    if info:
+        print(f"Output Stream Device: #{default_speaker['index']} {default_speaker['name']}")
+    return default_speaker
+
+
+def getOutputStream():
+    """
+    Open a 16-bit input stream on the default loopback device.
+
+    Returns:
+        the stream opened with the device's maxInputChannels and default
+        sample rate
+    """
+    mic = pyaudio.PyAudio()
+    default_speaker = getDefaultLoopbackDevice(mic, False)
+
+    stream = mic.open(
+        format = pyaudio.paInt16,
+        channels = default_speaker["maxInputChannels"],
+        rate = int(default_speaker["defaultSampleRate"]),
+        input = True,
+        input_device_index = default_speaker["index"]
+    )
+
+    return stream