first live transcription test code

2026-02-26 15:54:20 -05:00 · 2026-02-26 15:54:20 -05:00 · 1432c5b400
commit 1432c5b400
2 changed files with 66 additions and 0 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+numpy
+sounddevice
+faster-whisper
--- a/test_live.py
+++ b/test_live.py
@@ -0,0 +1,63 @@
+import sounddevice as sd
+import numpy as np
+import queue
+import threading
+from faster_whisper import WhisperModel
+
+# --- Config ---
+MODEL_SIZE = "base"      # model name handed to WhisperModel below
+SAMPLE_RATE = 16000      # Whisper expects 16kHz
+CHUNK_SECONDS = 3        # record N seconds, then transcribe
+DEVICE = None            # None = default mic, or set to device index
+
+# --- Setup ---
+model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")  # built once at import, used by transcribe_loop
+audio_queue = queue.Queue()  # hands audio blocks from audio_callback to transcribe_loop
+
+def audio_callback(indata, frames, time, status):
+    """Stream callback: report any status flag, then queue a copy of the block."""
+    if status:  # only truthy when the stream has something to report
+        print("Audio status:", status)
+    audio_queue.put_nowait(indata.copy())  # queue is unbounded, so this never blocks
+
+def transcribe_loop():
+    """Batch queued audio into ~CHUNK_SECONDS windows and print each transcript."""
+    print("Listening... (Ctrl+C to stop)\n")
+    window_samples = SAMPLE_RATE * CHUNK_SECONDS
+    pending, pending_samples = [], 0
+
+    while True:
+        block = audio_queue.get()  # blocks until the callback delivers audio
+        pending.append(block)
+        pending_samples += block.shape[0]
+        if pending_samples < window_samples:
+            continue  # not enough audio yet; keep accumulating
+
+        # A full window is ready: flatten it to one float32 array and start over
+        samples = np.concatenate(pending, axis=0).flatten().astype(np.float32)
+        pending, pending_samples = [], 0
+        segments, _ = model.transcribe(
+            samples,
+            language="en",
+            vad_filter=True,  # ignore silence
+        )
+        transcript = " ".join(seg.text for seg in segments).strip()
+        if transcript:
+            print(f">> {transcript}")
+
+# --- Run ---
+t = threading.Thread(target=transcribe_loop, daemon=True)  # daemon: ends with the main thread
+t.start()
+
+with sd.InputStream(
+    samplerate=SAMPLE_RATE,  # array goes to the model as-is, so capture at its rate
+    channels=1,
+    dtype="float32",
+    blocksize=int(SAMPLE_RATE * 0.5),  # 0.5s blocks fed to callback
+    device=DEVICE,  # honour the config value (None = system default)
+    callback=audio_callback
+):
+    try:
+        threading.Event().wait()  # block main thread forever
+    except KeyboardInterrupt:
+        print("\nStopped.")