63 lines
1.9 KiB
Python
63 lines
1.9 KiB
Python
import sounddevice as sd
|
|
import numpy as np
|
|
import queue
|
|
import threading
|
|
from faster_whisper import WhisperModel
|
|
|
|
# --- Config ---
MODEL_SIZE = "base"   # faster-whisper model size, e.g. "tiny"/"base"/"small"
SAMPLE_RATE = 16000   # Whisper expects 16kHz
CHUNK_SECONDS = 3     # record N seconds, then transcribe
DEVICE = None         # None = default mic, or set to device index

# --- Setup ---
# NOTE(review): device="cuda" will raise at import time on machines without a
# CUDA-capable GPU; consider a "cpu" fallback — confirm deployment target.
model = WhisperModel(MODEL_SIZE, device="cuda", compute_type="int8")
# Thread-safe handoff: the sounddevice callback thread produces audio blocks,
# the transcription thread consumes them.
audio_queue = queue.Queue()
|
|
|
|
def audio_callback(indata, frames, time, status):
    """Receive one block of microphone samples and enqueue a copy of it.

    Signature is dictated by sounddevice's stream-callback contract;
    only ``indata`` and ``status`` are used here.
    """
    if status:  # non-fatal stream warnings (e.g. input overflow)
        print("Audio status:", status)
    # Copy before queueing: sounddevice recycles the buffer after this returns.
    audio_queue.put(indata.copy())
|
|
|
|
def transcribe_loop():
    """Drain the audio queue and print a transcript for every CHUNK_SECONDS
    of accumulated microphone audio. Runs forever; intended for a daemon thread."""
    print("Listening... (Ctrl+C to stop)\n")
    pending = []
    samples_needed = SAMPLE_RATE * CHUNK_SECONDS  # threshold, hoisted out of the loop

    while True:
        pending.append(audio_queue.get())

        # Keep accumulating until at least CHUNK_SECONDS worth of samples arrived.
        if sum(block.shape[0] for block in pending) < samples_needed:
            continue

        # Flatten the queued blocks into one mono float32 array for the model.
        samples = np.concatenate(pending, axis=0).flatten().astype(np.float32)
        pending = []

        segments, _ = model.transcribe(
            samples,
            language="en",
            vad_filter=True,  # ignore silence
        )

        text = " ".join(seg.text for seg in segments).strip()
        if text:
            print(f">> {text}")
|
|
|
|
# --- Run ---
# Transcription runs on a daemon thread so a KeyboardInterrupt in the main
# thread still terminates the whole process.
t = threading.Thread(target=transcribe_loop, daemon=True)
t.start()

with sd.InputStream(
    samplerate=SAMPLE_RATE,            # capture directly at Whisper's 16 kHz
    channels=1,                        # mono — transcribe_loop flattens anyway
    dtype="float32",
    blocksize=int(SAMPLE_RATE * 0.5),  # 0.5s blocks fed to callback
    device=DEVICE,                     # honor configured input device (None = default)
    callback=audio_callback,
):
    try:
        threading.Event().wait()  # block main thread forever
    except KeyboardInterrupt:
        print("\nStopped.")
|