From 3792eb88b6ac90374495ad3a9f64752b1abaeed6 Mon Sep 17 00:00:00 2001 From: himeditator Date: Sat, 26 Jul 2025 23:37:24 +0800 Subject: [PATCH] =?UTF-8?q?refactor(engine):=20=E9=87=8D=E6=9E=84=E5=AD=97?= =?UTF-8?q?=E5=B9=95=E5=BC=95=E6=93=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 更新 GummyTranslator 类,优化字幕生成逻辑 - 移除 audioprcs 模块,音频处理功能转移到 utils 模块 - 重构 sysaudio 模块,提高音频流管理的灵活性和稳定性 - 修改 TODO.md,完成按时间降序排列字幕记录的功能 - 更新文档,说明因资源限制将不再维护英文和日文文档 --- .gitignore | 2 +- docs/TODO.md | 2 +- docs/api-docs/caption-engine.md | 51 +++++++++++++++++ docs/engine-manual/en.md | 2 + docs/engine-manual/ja.md | 2 + docs/user-manual/en.md | 2 + docs/user-manual/ja.md | 2 + engine/audio2text/gummy.py | 47 +++++++--------- engine/audioprcs/__init__.py | 1 - engine/main-gummy.py | 21 ++----- engine/main-vosk.py | 17 ++---- engine/sysaudio/__init__.py | 10 ++++ engine/sysaudio/darwin.py | 77 ++++++++++++++++---------- engine/sysaudio/linux.py | 46 +++++++++------ engine/sysaudio/win.py | 57 +++++++++++-------- engine/utils/__init__.py | 2 + engine/{audioprcs => utils}/process.py | 22 ++++---- engine/utils/sysout.py | 18 ++++++ 18 files changed, 245 insertions(+), 136 deletions(-) create mode 100644 docs/api-docs/caption-engine.md delete mode 100644 engine/audioprcs/__init__.py create mode 100644 engine/utils/__init__.py rename engine/{audioprcs => utils}/process.py (75%) create mode 100644 engine/utils/sysout.py diff --git a/.gitignore b/.gitignore index 374c927..81ae53f 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,4 @@ __pycache__ subenv engine/build engine/models -engine/test +engine/notebook diff --git a/docs/TODO.md b/docs/TODO.md index 030c1c3..71bad11 100644 --- a/docs/TODO.md +++ b/docs/TODO.md @@ -15,10 +15,10 @@ - [x] 可以调整字幕时间轴 *2025/07/14* - [x] 可以导出 srt 格式的字幕记录 *2025/07/14* - [x] 可以获取字幕引擎的系统资源消耗情况 *2025/07/15* +- [x] 添加字幕记录按时间降序排列选择 *2025/07/26* ## 待完成 -- [ ] 修改字幕记录展示逻辑 - [ ] 重构字幕引擎 - [ ] 验证 / 添加基于 sherpa-onnx 的字幕引擎 diff --git a/docs/api-docs/caption-engine.md b/docs/api-docs/caption-engine.md new file mode 100644 index 0000000..461db42 --- /dev/null +++ b/docs/api-docs/caption-engine.md @@ -0,0 +1,51 @@ +# caption engine api-doc + +本文档主要 Electron 主进程和字幕引擎进程的通信约定。 + +## 原理说明 + +本项目的 Python 进程通过标准输出向 Electron 主进程发送数据。 + +Python 进程标准输出 (`sys.stdout`) 的内容一定为一行一行的字符串。且每行字符串均可以解释为一个 JSON 对象。每个 JSON 对象一定有 `command` 参数。 + +## 输出约定 + +当 JSON 对象的 `command` 参数为下列值时,表示的对应的含义: + +### `print` + +```js +{ + command: "print", + content: string +} +``` + +输出 Python 端打印的内容。 + +### `info` + +```js +{ + command: "info", + content: string +} +``` + +Python 端打印的提示信息。 + +### `caption` + +```js +{ + command: "caption", + index: number, + time_s: string, + time_t: string, + end: boolean, + text: string, + translation: string +} +``` + +Python 端监听到的音频流转换为的字幕数据。 \ No newline at end of file diff --git a/docs/engine-manual/en.md b/docs/engine-manual/en.md index b060855..78d6e27 100644 --- a/docs/engine-manual/en.md +++ b/docs/engine-manual/en.md @@ -2,6 +2,8 @@ Corresponding Version: v0.5.1 +**Note: Due to limited personal resources, the English and Japanese documentation files for this project (except for the README document) will no longer be maintained. The content of this document may not be consistent with the latest version of the project. 
If you are willing to help with translation, please submit relevant Pull Requests.** + ![](../../assets/media/structure_en.png) ## Introduction to the Caption Engine diff --git a/docs/engine-manual/ja.md b/docs/engine-manual/ja.md index f1d9abe..4f9efcc 100644 --- a/docs/engine-manual/ja.md +++ b/docs/engine-manual/ja.md @@ -4,6 +4,8 @@ この文書は大規模モデルを使用して翻訳されていますので、内容に正確でない部分があるかもしれません。 +**注意:個人のリソースが限られているため、このプロジェクトの英語および日本語のドキュメント(README ドキュメントを除く)のメンテナンスは行われません。このドキュメントの内容は最新版のプロジェクトと一致しない場合があります。翻訳のお手伝いをしていただける場合は、関連するプルリクエストを提出してください。** + ![](../../assets/media/structure_ja.png) ## 字幕エンジンの紹介 diff --git a/docs/user-manual/en.md b/docs/user-manual/en.md index 5972c60..bbac9d5 100644 --- a/docs/user-manual/en.md +++ b/docs/user-manual/en.md @@ -2,6 +2,8 @@ Corresponding Version: v0.5.1 +**Note: Due to limited personal resources, the English and Japanese documentation files for this project (except for the README document) will no longer be maintained. The content of this document may not be consistent with the latest version of the project. If you are willing to help with translation, please submit relevant Pull Requests.** + ## Software Introduction Auto Caption is a cross-platform caption display software that can real-time capture system audio input (recording) or output (playback) streaming data and use an audio-to-text model to generate captions for the corresponding audio. The default caption engine provided by the software (using Alibaba Cloud Gummy model) supports recognition and translation in nine languages (Chinese, English, Japanese, Korean, German, French, Russian, Spanish, Italian). diff --git a/docs/user-manual/ja.md b/docs/user-manual/ja.md index aee9e47..2350c9b 100644 --- a/docs/user-manual/ja.md +++ b/docs/user-manual/ja.md @@ -4,6 +4,8 @@ この文書は大規模モデルを使用して翻訳されていますので、内容に正確でない部分があるかもしれません。 +**注意:個人のリソースが限られているため、このプロジェクトの英語および日本語のドキュメント(README ドキュメントを除く)のメンテナンスは行われません。このドキュメントの内容は最新版のプロジェクトと一致しない場合があります。翻訳のお手伝いをしていただける場合は、関連するプルリクエストを提出してください。** + ## ソフトウェアの概要 Auto Caption は、クロスプラットフォームの字幕表示ソフトウェアで、システムの音声入力(録音)または出力(音声再生)のストリーミングデータをリアルタイムで取得し、音声からテキストに変換するモデルを利用して対応する音声の字幕を生成します。このソフトウェアが提供するデフォルトの字幕エンジン(アリババクラウド Gummy モデルを使用)は、9つの言語(中国語、英語、日本語、韓国語、ドイツ語、フランス語、ロシア語、スペイン語、イタリア語)の認識と翻訳をサポートしています。 diff --git a/engine/audio2text/gummy.py b/engine/audio2text/gummy.py index ceca937..072f5a2 100644 --- a/engine/audio2text/gummy.py +++ b/engine/audio2text/gummy.py @@ -6,8 +6,8 @@ from dashscope.audio.asr import ( ) import dashscope from datetime import datetime -import json -import sys +from utils import stdout_cmd, stdout_obj + class Callback(TranslationRecognizerCallback): """ @@ -17,15 +17,16 @@ class Callback(TranslationRecognizerCallback): super().__init__() self.usage = 0 self.cur_id = -1 + self.index = 0 self.time_str = '' def on_open(self) -> None: - # print("on_open") - pass + self.cur_id = -1 + self.time_str = '' + stdout_cmd('info', 'Gummy translator started.') def on_close(self) -> None: - # print("on_close") - pass + stdout_cmd('info', 'Gummy translator closed.') def on_event( self, @@ -35,17 +36,18 @@ class Callback(TranslationRecognizerCallback): usage ) -> None: caption = {} + if transcription_result is not None: - caption['index'] = transcription_result.sentence_id - caption['text'] = transcription_result.text - if caption['index'] != self.cur_id: - self.cur_id = caption['index'] - cur_time = datetime.now().strftime('%H:%M:%S.%f')[:-3] - caption['time_s'] = cur_time - self.time_str = cur_time - else: - caption['time_s'] = self.time_str + if self.cur_id != 
transcription_result.sentence_id: + self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3] + self.cur_id = transcription_result.sentence_id + self.index += 1 + caption['command'] = 'caption' + caption['index'] = self.index + caption['time_s'] = self.time_str caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3] + caption['end'] = transcription_result.is_sentence_end + caption['text'] = transcription_result.text caption['translation'] = "" if translation_result is not None: @@ -55,19 +57,8 @@ class Callback(TranslationRecognizerCallback): if usage: self.usage += usage['duration'] - # print(caption) - self.send_to_node(caption) + stdout_obj(caption) - def send_to_node(self, data): - """ - 将数据发送到 Node.js 进程 - """ - try: - json_data = json.dumps(data) + '\n' - sys.stdout.write(json_data) - sys.stdout.flush() - except Exception as e: - print(f"Error sending data to Node.js: {e}", file=sys.stderr) class GummyTranslator: """ @@ -78,7 +69,7 @@ class GummyTranslator: source: 源语言代码字符串(zh, en, ja 等) target: 目标语言代码字符串(zh, en, ja 等) """ - def __init__(self, rate, source, target, api_key): + def __init__(self, rate: int, source: str, target: str | None, api_key: str | None): if api_key: dashscope.api_key = api_key self.translator = TranslationRecognizerRealtime( diff --git a/engine/audioprcs/__init__.py b/engine/audioprcs/__init__.py deleted file mode 100644 index 0d542f8..0000000 --- a/engine/audioprcs/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .process import mergeChunkChannels, resampleRawChunk, resampleMonoChunk diff --git a/engine/main-gummy.py b/engine/main-gummy.py index d4f0ea9..690faae 100644 --- a/engine/main-gummy.py +++ b/engine/main-gummy.py @@ -1,21 +1,11 @@ import sys import argparse - -if sys.platform == 'win32': - from sysaudio.win import AudioStream -elif sys.platform == 'darwin': - from sysaudio.darwin import AudioStream -elif sys.platform == 'linux': - from sysaudio.linux import AudioStream -else: - raise NotImplementedError(f"Unsupported platform: {sys.platform}") - -from audioprcs import mergeChunkChannels +from sysaudio import AudioStream +from utils import merge_chunk_channels from audio2text import InvalidParameter, GummyTranslator def convert_audio_to_text(s_lang, t_lang, audio_type, chunk_rate, api_key): - sys.stdout.reconfigure(line_buffering=True) # type: ignore stream = AudioStream(audio_type, chunk_rate) if t_lang == 'none': @@ -23,20 +13,21 @@ def convert_audio_to_text(s_lang, t_lang, audio_type, chunk_rate, api_key): else: gummy = GummyTranslator(stream.RATE, s_lang, t_lang, api_key) - stream.openStream() + stream.open_stream() gummy.start() while True: try: chunk = stream.read_chunk() - chunk_mono = mergeChunkChannels(chunk, stream.CHANNELS) + if chunk is None: continue + chunk_mono = merge_chunk_channels(chunk, stream.CHANNELS) try: gummy.send_audio_frame(chunk_mono) except InvalidParameter: gummy.start() gummy.send_audio_frame(chunk_mono) except KeyboardInterrupt: - stream.closeStream() + stream.close_stream() gummy.stop() break diff --git a/engine/main-vosk.py b/engine/main-vosk.py index cf407f7..a095909 100644 --- a/engine/main-vosk.py +++ b/engine/main-vosk.py @@ -4,17 +4,9 @@ import argparse from datetime import datetime import numpy.core.multiarray -if sys.platform == 'win32': - from sysaudio.win import AudioStream -elif sys.platform == 'darwin': - from sysaudio.darwin import AudioStream -elif sys.platform == 'linux': - from sysaudio.linux import AudioStream -else: - raise NotImplementedError(f"Unsupported platform: {sys.platform}") - +from 
sysaudio import AudioStream from vosk import Model, KaldiRecognizer, SetLogLevel -from audioprcs import resampleRawChunk +from utils import resample_chunk_mono SetLogLevel(-1) @@ -30,7 +22,7 @@ def convert_audio_to_text(audio_type, chunk_rate, model_path): recognizer = KaldiRecognizer(model, 16000) stream = AudioStream(audio_type, chunk_rate) - stream.openStream() + stream.open_stream() time_str = '' cur_id = 0 @@ -38,7 +30,8 @@ def convert_audio_to_text(audio_type, chunk_rate, model_path): while True: chunk = stream.read_chunk() - chunk_mono = resampleRawChunk(chunk, stream.CHANNELS, stream.RATE, 16000) + if chunk is None: continue + chunk_mono = resample_chunk_mono(chunk, stream.CHANNELS, stream.RATE, 16000) caption = {} if recognizer.AcceptWaveform(chunk_mono): diff --git a/engine/sysaudio/__init__.py b/engine/sysaudio/__init__.py index e69de29..0bd8205 100644 --- a/engine/sysaudio/__init__.py +++ b/engine/sysaudio/__init__.py @@ -0,0 +1,10 @@ +import sys + +if sys.platform == "win32": + from .win import AudioStream +elif sys.platform == "darwin": + from .darwin import AudioStream +elif sys.platform == "linux": + from .linux import AudioStream +else: + raise NotImplementedError(f"Unsupported platform: {sys.platform}") \ No newline at end of file diff --git a/engine/sysaudio/darwin.py b/engine/sysaudio/darwin.py index 4c5d6dd..a3a8d6e 100644 --- a/engine/sysaudio/darwin.py +++ b/engine/sysaudio/darwin.py @@ -1,11 +1,24 @@ """获取 MacOS 系统音频输入/输出流""" import pyaudio +from textwrap import dedent + + +def get_blackhole_device(mic: pyaudio.PyAudio): + """ + 获取 BlackHole 设备 + """ + device_count = mic.get_device_count() + for i in range(device_count): + dev_info = mic.get_device_info_by_index(i) + if 'blackhole' in str(dev_info["name"]).lower(): + return dev_info + raise Exception("The device containing BlackHole was not found.") class AudioStream: """ - 获取系统音频流(支持 BlackHole 作为系统音频输出捕获) + 获取系统音频流(如果要捕获输出音频,仅支持 BlackHole 作为系统音频输出捕获) 初始化参数: audio_type: 0-系统音频输出流(需配合 BlackHole),1-系统音频输入流 @@ -15,46 +28,40 @@ class AudioStream: self.audio_type = audio_type self.mic = pyaudio.PyAudio() if self.audio_type == 0: - self.device = self.getOutputDeviceInfo() + self.device = get_blackhole_device(self.mic) else: self.device = self.mic.get_default_input_device_info() + self.stop_signal = False self.stream = None - self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16) + self.INDEX = self.device["index"] self.FORMAT = pyaudio.paInt16 - self.CHANNELS = self.device["maxInputChannels"] + self.SAMP_WIDTH = pyaudio.get_sample_size(self.FORMAT) + self.CHANNELS = int(self.device["maxInputChannels"]) self.RATE = int(self.device["defaultSampleRate"]) self.CHUNK = self.RATE // chunk_rate - self.INDEX = self.device["index"] - def getOutputDeviceInfo(self): - """查找指定关键词的输入设备""" - device_count = self.mic.get_device_count() - for i in range(device_count): - dev_info = self.mic.get_device_info_by_index(i) - if 'blackhole' in dev_info["name"].lower(): - return dev_info - raise Exception("The device containing BlackHole was not found.") - - def printInfo(self): + def get_info(self): dev_info = f""" - 采样输入设备: + 采样设备: - 设备类型:{ "音频输出" if self.audio_type == 0 else "音频输入" } - - 序号:{self.device['index']} - - 名称:{self.device['name']} + - 设备序号:{self.device['index']} + - 设备名称:{self.device['name']} - 最大输入通道数:{self.device['maxInputChannels']} - 默认低输入延迟:{self.device['defaultLowInputLatency']}s - 默认高输入延迟:{self.device['defaultHighInputLatency']}s - 默认采样率:{self.device['defaultSampleRate']}Hz + - 是否回环设备:{self.device['isLoopbackDevice']} - 
音频样本块大小:{self.CHUNK} + 设备序号:{self.INDEX} + 样本格式:{self.FORMAT} 样本位宽:{self.SAMP_WIDTH} - 采样格式:{self.FORMAT} - 音频通道数:{self.CHANNELS} - 音频采样率:{self.RATE} + 样本通道数:{self.CHANNELS} + 样本采样率:{self.RATE} + 样本块大小:{self.CHUNK} """ - print(dev_info) + return dedent(dev_info).strip() - def openStream(self): + def open_stream(self): """ 打开并返回系统音频输出流 """ @@ -72,14 +79,24 @@ class AudioStream: """ 读取音频数据 """ + if self.stop_signal: + self.close_stream() + return None if not self.stream: return None return self.stream.read(self.CHUNK, exception_on_overflow=False) - def closeStream(self): + def close_stream_signal(self): """ - 关闭系统音频输出流 + 线程安全的关闭系统音频输入流,不一定会立即关闭 """ - if self.stream is None: return - self.stream.stop_stream() - self.stream.close() - self.stream = None + self.stop_signal = True + + def close_stream(self): + """ + 立即关闭系统音频输入流 + """ + if self.stream is not None: + self.stream.stop_stream() + self.stream.close() + self.stream = None + self.stop_signal = False diff --git a/engine/sysaudio/linux.py b/engine/sysaudio/linux.py index 58be353..0a5644a 100644 --- a/engine/sysaudio/linux.py +++ b/engine/sysaudio/linux.py @@ -1,8 +1,10 @@ """获取 Linux 系统音频输入流""" import subprocess +from textwrap import dedent -def findMonitorSource(): + +def find_monitor_source(): result = subprocess.run( ["pactl", "list", "short", "sources"], stdout=subprocess.PIPE, text=True @@ -16,7 +18,8 @@ def findMonitorSource(): raise RuntimeError("System output monitor device not found") -def findInputSource(): + +def find_input_source(): result = subprocess.run( ["pactl", "list", "short", "sources"], stdout=subprocess.PIPE, text=True @@ -28,8 +31,10 @@ def findInputSource(): name = parts[1] if ".monitor" not in name: return name + raise RuntimeError("Microphone input device not found") + class AudioStream: """ 获取系统音频流 @@ -42,34 +47,33 @@ class AudioStream: self.audio_type = audio_type if self.audio_type == 0: - self.source = findMonitorSource() + self.source = find_monitor_source() else: - self.source = findInputSource() - + self.source = find_input_source() + self.stop_signal = False self.process = None - - self.SAMP_WIDTH = 2 self.FORMAT = 16 + self.SAMP_WIDTH = 2 self.CHANNELS = 2 self.RATE = 48000 self.CHUNK = self.RATE // chunk_rate - def printInfo(self): + def get_info(self): dev_info = f""" 音频捕获进程: - 捕获类型:{"音频输出" if self.audio_type == 0 else "音频输入"} - 设备源:{self.source} - - 捕获进程PID:{self.process.pid if self.process else "None"} + - 捕获进程 PID:{self.process.pid if self.process else "None"} - 音频样本块大小:{self.CHUNK} + 样本格式:{self.FORMAT} 样本位宽:{self.SAMP_WIDTH} - 采样格式:{self.FORMAT} - 音频通道数:{self.CHANNELS} - 音频采样率:{self.RATE} + 样本通道数:{self.CHANNELS} + 样本采样率:{self.RATE} + 样本块大小:{self.CHUNK} """ print(dev_info) - def openStream(self): + def open_stream(self): """ 启动音频捕获进程 """ @@ -82,13 +86,23 @@ class AudioStream: """ 读取音频数据 """ - if self.process: + if self.stop_signal: + self.close_stream() + return None + if self.process and self.process.stdout: return self.process.stdout.read(self.CHUNK) return None - def closeStream(self): + def close_stream_signal(self): + """ + 线程安全的关闭系统音频输入流,不一定会立即关闭 + """ + self.stop_signal = True + + def close_stream(self): """ 关闭系统音频捕获进程 """ if self.process: self.process.terminate() + self.stop_signal = False diff --git a/engine/sysaudio/win.py b/engine/sysaudio/win.py index c6765ce..247b434 100644 --- a/engine/sysaudio/win.py +++ b/engine/sysaudio/win.py @@ -1,14 +1,15 @@ """获取 Windows 系统音频输入/输出流""" import pyaudiowpatch as pyaudio +from textwrap import dedent -def getDefaultLoopbackDevice(mic: pyaudio.PyAudio, 
info = True)->dict: +def get_default_loopback_device(mic: pyaudio.PyAudio, info = True)->dict: """ 获取默认的系统音频输出的回环设备 Args: - mic (pyaudio.PyAudio): pyaudio对象 - info (bool, optional): 是否打印设备信息 + mic: pyaudio对象 + info: 是否打印设备信息 Returns: dict: 系统音频输出的回环设备 @@ -51,38 +52,40 @@ class AudioStream: self.audio_type = audio_type self.mic = pyaudio.PyAudio() if self.audio_type == 0: - self.device = getDefaultLoopbackDevice(self.mic, False) + self.device = get_default_loopback_device(self.mic, False) else: self.device = self.mic.get_default_input_device_info() + self.stop_signal = False self.stream = None - self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16) + self.INDEX = self.device["index"] self.FORMAT = pyaudio.paInt16 + self.SAMP_WIDTH = pyaudio.get_sample_size(self.FORMAT) self.CHANNELS = int(self.device["maxInputChannels"]) self.RATE = int(self.device["defaultSampleRate"]) self.CHUNK = self.RATE // chunk_rate - self.INDEX = self.device["index"] - def printInfo(self): + def get_info(self): dev_info = f""" 采样设备: - 设备类型:{ "音频输出" if self.audio_type == 0 else "音频输入" } - - 序号:{self.device['index']} - - 名称:{self.device['name']} + - 设备序号:{self.device['index']} + - 设备名称:{self.device['name']} - 最大输入通道数:{self.device['maxInputChannels']} - 默认低输入延迟:{self.device['defaultLowInputLatency']}s - 默认高输入延迟:{self.device['defaultHighInputLatency']}s - 默认采样率:{self.device['defaultSampleRate']}Hz - 是否回环设备:{self.device['isLoopbackDevice']} - 音频样本块大小:{self.CHUNK} + 设备序号:{self.INDEX} + 样本格式:{self.FORMAT} 样本位宽:{self.SAMP_WIDTH} - 采样格式:{self.FORMAT} - 音频通道数:{self.CHANNELS} - 音频采样率:{self.RATE} + 样本通道数:{self.CHANNELS} + 样本采样率:{self.RATE} + 样本块大小:{self.CHUNK} """ - print(dev_info) + return dedent(dev_info).strip() - def openStream(self): + def open_stream(self): """ 打开并返回系统音频输出流 """ @@ -96,18 +99,28 @@ class AudioStream: ) return self.stream - def read_chunk(self): + def read_chunk(self) -> bytes | None: """ 读取音频数据 """ + if self.stop_signal: + self.close_stream() + return None if not self.stream: return None return self.stream.read(self.CHUNK, exception_on_overflow=False) - def closeStream(self): + def close_stream_signal(self): """ - 关闭系统音频输出流 + 线程安全的关闭系统音频输入流,不一定会立即关闭 """ - if self.stream is None: return - self.stream.stop_stream() - self.stream.close() - self.stream = None + self.stop_signal = True + + def close_stream(self): + """ + 关闭系统音频输入流 + """ + if self.stream is not None: + self.stream.stop_stream() + self.stream.close() + self.stream = None + self.stop_signal = False diff --git a/engine/utils/__init__.py b/engine/utils/__init__.py new file mode 100644 index 0000000..2589863 --- /dev/null +++ b/engine/utils/__init__.py @@ -0,0 +1,2 @@ +from .process import merge_chunk_channels, resample_chunk_mono, resample_mono_chunk +from .sysout import stdout, stdout_cmd, stdout_obj, stderr \ No newline at end of file diff --git a/engine/audioprcs/process.py b/engine/utils/process.py similarity index 75% rename from engine/audioprcs/process.py rename to engine/utils/process.py index 9298593..01c854d 100644 --- a/engine/audioprcs/process.py +++ b/engine/utils/process.py @@ -1,16 +1,17 @@ import samplerate import numpy as np -def mergeChunkChannels(chunk, channels): + +def merge_chunk_channels(chunk: bytes, channels: int) -> bytes: """ 将当前多通道音频数据块转换为单通道音频数据块 Args: - chunk: (bytes)多通道音频数据块 + chunk: 多通道音频数据块 channels: 通道数 Returns: - (bytes)单通道音频数据块 + 单通道音频数据块 """ # (length * channels,) chunk_np = np.frombuffer(chunk, dtype=np.int16) @@ -22,19 +23,19 @@ def mergeChunkChannels(chunk, channels): return chunk_mono.tobytes() -def 
resampleRawChunk(chunk, channels, orig_sr, target_sr, mode="sinc_best"): +def resample_chunk_mono(chunk: bytes, channels: int, orig_sr: int, target_sr: int, mode="sinc_best") -> bytes: """ 将当前多通道音频数据块转换成单通道音频数据块,然后进行重采样 Args: - chunk: (bytes)多通道音频数据块 + chunk: 多通道音频数据块 channels: 通道数 orig_sr: 原始采样率 target_sr: 目标采样率 mode: 重采样模式,可选:'sinc_best' | 'sinc_medium' | 'sinc_fastest' | 'zero_order_hold' | 'linear' Return: - (bytes)单通道音频数据块 + 单通道音频数据块 """ # (length * channels,) chunk_np = np.frombuffer(chunk, dtype=np.int16) @@ -44,22 +45,23 @@ def resampleRawChunk(chunk, channels, orig_sr, target_sr, mode="sinc_best"): chunk_mono_f = np.mean(chunk_np.astype(np.float32), axis=1) chunk_mono = chunk_mono_f.astype(np.int16) ratio = target_sr / orig_sr - chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode) + chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode) chunk_mono_r = np.round(chunk_mono_r).astype(np.int16) return chunk_mono_r.tobytes() -def resampleMonoChunk(chunk, orig_sr, target_sr, mode="sinc_best"): + +def resample_mono_chunk(chunk: bytes, orig_sr: int, target_sr: int, mode="sinc_best") -> bytes: """ 将当前单通道音频块进行重采样 Args: - chunk: (bytes)单通道音频数据块 + chunk: 单通道音频数据块 orig_sr: 原始采样率 target_sr: 目标采样率 mode: 重采样模式,可选:'sinc_best' | 'sinc_medium' | 'sinc_fastest' | 'zero_order_hold' | 'linear' Return: - (bytes)单通道音频数据块 + 单通道音频数据块 """ chunk_np = np.frombuffer(chunk, dtype=np.int16) ratio = target_sr / orig_sr diff --git a/engine/utils/sysout.py b/engine/utils/sysout.py new file mode 100644 index 0000000..574b2cf --- /dev/null +++ b/engine/utils/sysout.py @@ -0,0 +1,18 @@ +import sys +import json + +def stdout(text: str): + stdout_cmd("print", text) + +def stdout_cmd(command: str, content = ""): + msg = { "command": command, "content": content } + sys.stdout.write(json.dumps(msg) + "\n") + sys.stdout.flush() + +def stdout_obj(obj): + sys.stdout.write(json.dumps(obj) + "\n") + sys.stdout.flush() + +def stderr(text: str): + sys.stderr.write(text + "\n") + sys.stderr.flush()
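The stdout contract introduced in `docs/api-docs/caption-engine.md` (one JSON object per line, always carrying a `command` field) can be exercised without Electron. Below is a minimal Python consumer sketch, not part of this patch: it spawns the Gummy engine and dispatches on `command` the way the Electron main process is expected to. The exact CLI arguments of `engine/main-gummy.py` are not visible in this diff, so the spawn arguments are a placeholder assumption; the `print`/`info`/`caption` fields follow the schema documented above.

```python
import json
import subprocess
import sys


def handle_line(line: str) -> None:
    """Dispatch one line of engine output per docs/api-docs/caption-engine.md."""
    msg = json.loads(line)        # every stdout line is a JSON object
    cmd = msg["command"]          # every object carries a "command" field
    if cmd in ("print", "info"):
        print(f"[engine:{cmd}] {msg['content']}")
    elif cmd == "caption":
        # index / time_s / time_t / end / text / translation, as documented
        marker = "*" if msg["end"] else " "
        print(f"{marker}{msg['index']:>4} {msg['time_s']}-{msg['time_t']} "
              f"{msg['text']} | {msg['translation']}")


if __name__ == "__main__":
    # Placeholder invocation: the real argparse flags of main-gummy.py
    # (languages, audio type, chunk rate, API key) must be appended here.
    proc = subprocess.Popen(
        [sys.executable, "engine/main-gummy.py"],
        stdout=subprocess.PIPE,
        text=True,
    )
    assert proc.stdout is not None
    for line in proc.stdout:
        if line.strip():
            handle_line(line)
```

Because `utils/sysout.py` calls `sys.stdout.flush()` after every write, each JSON object arrives as a complete line even when stdout is piped to a consumer like the sketch above or the Electron main process.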