refactor(项目): 尝试 Python 语音识别和内容发送

2026-02-13 19:23:26 +08:00 · 2025-06-17 21:26:16 +08:00
parent 1e83ad2199
commit d1bee65ae1
11 changed files with 158 additions and 357 deletions
--- a/python-subprocess/audio2text/gummy.py
+++ b/python-subprocess/audio2text/gummy.py
@@ -4,6 +4,7 @@ from dashscope.audio.asr import (
    TranslationResult,
    TranslationRecognizerRealtime    
 )
+from datetime import datetime

 class Callback(TranslationRecognizerCallback):
    """
@@ -12,17 +13,15 @@ class Callback(TranslationRecognizerCallback):
    def __init__(self):
        super().__init__()
        self.usage = 0
-        self.sentences = []
-        self.translations = []
+        self.cur_id = -1
+        self.time_str = ''
    
    def on_open(self) -> None:
-        print("\nGummy 流式翻译开始...\n")
+        print("INFO gummy translation start...")

    def on_close(self) -> None:
-        print(f"\nTokens消耗：{self.usage}")
-        print(f"流式翻译结束...\n")
-        for i in range(len(self.sentences)):
-            print(f"\n{self.sentences[i]}\n{self.translations[i]}\n")
+        print(f"INFO tokens usage: {self.usage}")
+        print(f"INFO translation end...")

    def on_event(
        self,
@@ -31,38 +30,37 @@ class Callback(TranslationRecognizerCallback):
        translation_result: TranslationResult,
        usage
    ) -> None:
+        caption = {}
        if transcription_result is not None:
-            id = transcription_result.sentence_id
-            text = transcription_result.text
-            if transcription_result.stash is not None:
-                stash = transcription_result.stash.text
+            caption['id'] = transcription_result.sentence_id
+            caption['text'] = transcription_result.text
+            if caption['id'] != self.cur_id:
+                self.cur_id = caption['id']
+                cur_time = datetime.now().strftime('%H:%M:%S')
+                caption['time_s'] = cur_time
+                self.time_str = cur_time
            else:
-                stash = ""
-            print(f"#{id}: {text}{stash}")
-            if usage: self.sentences.append(text)
+                caption['time_s'] = self.time_str
+            caption['time_t'] = datetime.now().strftime('%H:%M:%S')
+            caption['translation'] = ""
        
        if translation_result is not None:
            lang = translation_result.get_language_list()[0]
-            text = translation_result.get_translation(lang).text
-            if translation_result.get_translation(lang).stash is not None:
-                stash = translation_result.get_translation(lang).stash.text
-            else:
-                stash = ""
-            print(f"#{lang}: {text}{stash}")
-            if usage: self.translations.append(text)
+            caption['translation'] = translation_result.get_translation(lang).text
        
-        if usage: self.usage += usage['duration']
-
+        if usage:
+            self.usage += usage['duration']
+        print(caption)

 class GummyTranslator:
    def __init__(self, rate, source, target):
        self.translator = TranslationRecognizerRealtime(
-        model = "gummy-realtime-v1",
-        format = "pcm",
-        sample_rate = rate,
-        transcription_enabled = True,
-        translation_enabled = True,
-        source_language = source,
-        translation_target_languages = [target],
-        callback = Callback()
-    )
+            model = "gummy-realtime-v1",
+            format = "pcm",
+            sample_rate = rate,
+            transcription_enabled = True,
+            translation_enabled = (target is not None),
+            source_language = source,
+            translation_target_languages = [target],
+            callback = Callback()
+        )