auto-caption/engine-test/vosk.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6fb12704",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "d:\\Projects\\auto-caption\\caption-engine\\subenv\\Lib\\site-packages\\vosk\\__init__.py\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: show which vosk installation this kernel actually imports,\n",
    "# so it is obvious whether the intended environment is active.\n",
    "import vosk\n",
    "\n",
    "vosk_init_path = vosk.__file__\n",
    "print(vosk_init_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "63a06f5c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "        采样设备：\n",
      "            - 设备类型：音频输入\n",
      "            - 序号：1\n",
      "            - 名称：麦克风阵列 (Realtek(R) Audio)\n",
      "            - 最大输入通道数：2\n",
      "            - 默认低输入延迟：0.09s\n",
      "            - 默认高输入延迟：0.18s\n",
      "            - 默认采样率：44100.0Hz\n",
      "            - 是否回环设备：False\n",
      "\n",
      "        音频样本块大小：2205\n",
      "        样本位宽：2\n",
      "        采样格式：8\n",
      "        音频通道数：2\n",
      "        音频采样率：44100\n",
      "        \n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import os\n",
    "import json\n",
    "from vosk import Model, KaldiRecognizer\n",
    "\n",
    "# Make the sibling caption-engine sources importable. The location is resolved\n",
    "# relative to the kernel's working directory, so the notebook must be started\n",
    "# from its own folder for the relative path to be correct.\n",
    "current_dir = os.getcwd()\n",
    "engine_dir = os.path.normpath(os.path.join(current_dir, '../caption-engine'))\n",
    "# Guard the append so re-running this cell does not stack duplicate entries.\n",
    "if engine_dir not in sys.path:\n",
    "    sys.path.append(engine_dir)\n",
    "\n",
    "from sysaudio.win import AudioStream\n",
    "from audioprcs import resampleRawChunk, mergeChunkChannels\n",
    "\n",
    "# NOTE(review): the argument 1 presumably selects the audio input (microphone)\n",
    "# device, as the saved output reports an input device - confirm against AudioStream.\n",
    "stream = AudioStream(1)\n",
    "stream.printInfo()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5d5a0afa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Vosk model from the caption-engine models folder (path is relative\n",
    "# to the working directory captured in `current_dir`) and create a recognizer\n",
    "# that is told to expect audio at 16000 Hz.\n",
    "model_dir = os.path.join(current_dir, '../caption-engine/models/vosk-model-small-cn-0.22')\n",
    "model = Model(model_dir)\n",
    "recognizer = KaldiRecognizer(model, 16000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e9d1530",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Capture a fixed number of audio chunks, feed them to the recognizer and print\n",
    "# finished utterances (\"acc:\") as well as in-progress hypotheses (\"else:\").\n",
    "TARGET_RATE = 16000  # must equal the sample rate passed to KaldiRecognizer\n",
    "N_CHUNKS = 200       # number of chunks to read before stopping\n",
    "\n",
    "stream.openStream()\n",
    "try:\n",
    "    for _ in range(N_CHUNKS):\n",
    "        chunk = stream.read_chunk()\n",
    "        # NOTE(review): resampleRawChunk presumably also downmixes to mono, judging\n",
    "        # by the CHANNELS argument and the variable name - confirm in audioprcs.\n",
    "        chunk_mono = resampleRawChunk(chunk, stream.CHANNELS, stream.RATE, TARGET_RATE)\n",
    "        if recognizer.AcceptWaveform(chunk_mono):\n",
    "            result = json.loads(recognizer.Result())\n",
    "            print(\"acc:\", result.get(\"text\", \"\"))\n",
    "        else:\n",
    "            partial = json.loads(recognizer.PartialResult())\n",
    "            print(\"else:\", partial.get(\"partial\", \"\"))\n",
    "    # Flush audio still buffered in the recognizer so the tail of the last\n",
    "    # utterance is not silently dropped when the loop ends.\n",
    "    final = json.loads(recognizer.FinalResult())\n",
    "    print(\"final:\", final.get(\"text\", \"\"))\n",
    "finally:\n",
    "    # Always release the audio device, even on an exception or kernel interrupt.\n",
    "    # NOTE(review): assumes AudioStream exposes closeStream() as the counterpart\n",
    "    # of openStream() - confirm against sysaudio.win.\n",
    "    stream.closeStream()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "subenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}