85 lines
2.4 KiB
Python
85 lines
2.4 KiB
Python
import sounddevice as sd
|
|
import numpy as np
|
|
import queue
|
|
import threading
|
|
import argparse
|
|
from faster_whisper import WhisperModel
|
|
|
|
# Hand-off queue between the capture side and the transcription side:
# audio_callback() puts a copy of every input block here and
# transcribe_loop() gets them in arrival order. No maxsize is given.
audio_queue = queue.Queue()
|
|
|
|
def audio_callback(indata, frames, time, status):
    """Input-stream callback: report stream status and queue the audio block.

    Installed as the ``callback`` of the ``sd.InputStream`` opened in
    ``listen``. Only ``indata`` and ``status`` are used; ``frames`` and
    ``time`` are accepted but ignored.
    """
    if status:
        print(f"Audio Status: {status}")

    # Queue a private copy rather than ``indata`` itself, so the consumer
    # never shares a buffer with the stream (presumably the stream reuses
    # it between callbacks -- confirm against the sounddevice docs).
    block = indata.copy()
    audio_queue.put(block)
|
|
|
|
def transcribe_loop():
    """Collect queued audio blocks and print a transcript for each chunk.

    Runs forever. Blocks are pulled from the module-level ``audio_queue``;
    once at least ``sample_rate * chunk_seconds`` samples have accumulated,
    they are joined into one float32 array, passed to the Whisper model and
    any non-empty text is printed to stdout. The model name and device come
    from the module-level ``args`` namespace.
    """
    sample_rate = 16000
    chunk_seconds = 3
    samples_per_chunk = sample_rate * chunk_seconds

    # Set up Whisper before announcing readiness.
    wmodel = WhisperModel(
        args.wmodel,
        device=args.whisperdevice,
        compute_type="int8",
    )

    pending = []          # blocks received since the last transcription
    pending_samples = 0   # running total of rows across ``pending``

    print(">> Transcribing")
    while True:
        # Blocks until the capture side delivers the next audio block.
        block = audio_queue.get()
        pending.append(block)
        pending_samples += block.shape[0]

        # Keep collecting until there is enough audio for a transcription.
        if pending_samples < samples_per_chunk:
            continue

        # Merge everything collected so far into one flat array and start
        # a fresh collection for the next chunk.
        audio_data = np.concatenate(pending, axis=0).flatten().astype(np.float32)
        pending = []
        pending_samples = 0

        segments, _ = wmodel.transcribe(
            audio_data,
            language="en",
            vad_filter=True,
        )

        text = " ".join(segment.text for segment in segments).strip()
        if text:
            print(text)
|
|
|
|
def listen():
    """Open the default input device and keep the stream alive indefinitely.

    While the stream is open, every captured block is handed to
    ``audio_callback``. The function then parks on an event that is never
    set; a ``KeyboardInterrupt`` raised in this thread ends the wait,
    prints a notice and lets the stream close.
    """
    print(">> Listening")

    stream_options = {
        "samplerate": 16000,
        "channels": 1,
        "dtype": "float32",
        "blocksize": 8000,  # 8000 frames at 16000 Hz = half a second per block
        "device": None,  # default system device
        "callback": audio_callback,
    }

    with sd.InputStream(**stream_options):
        park = threading.Event()  # never set: waiting on it blocks for good
        try:
            park.wait()
        except KeyboardInterrupt:
            print("\nListening Thread Stopped")
|
|
|
|
def main(args):
    """Run the capture and transcription workers until Ctrl-C is pressed.

    Args:
        args: Parsed command-line namespace. It is not read here;
            ``transcribe_loop`` reads the module-level ``args`` instead.

    Returns:
        None, once a ``KeyboardInterrupt`` has been received.
    """
    # Both workers block indefinitely (``listen`` waits on an event that is
    # never set, ``transcribe_loop`` is a ``while True``), and Python raises
    # KeyboardInterrupt only in the main thread. As non-daemon threads they
    # would keep the interpreter from exiting after this function returns,
    # so they are started as daemon threads and die with the process.

    # Start listening thread
    t_listen = threading.Thread(target=listen, daemon=True)
    t_listen.start()

    # Start transcription thread
    t_transcribe = threading.Thread(target=transcribe_loop, daemon=True)
    t_transcribe.start()

    try:
        # Park the main thread; only Ctrl-C ends the wait.
        threading.Event().wait()
    except KeyboardInterrupt:
        print("Main Thread Stopped")
|
|
|
|
if __name__ == "__main__":
    # Command-line interface: Whisper model name and inference device.
    # NOTE(review): the description advertises EN->FR translation, but the
    # code visible in this file only transcribes English -- confirm intent.
    cli = argparse.ArgumentParser(
        prog='Interprete',
        description='Live EN->FR Voice-to-text Translation',
        epilog='Well... go and use it now.',
    )
    cli.add_argument('-w', '--wmodel', default="base")
    cli.add_argument('--whisperdevice', default="cpu")

    # ``args`` has to stay a module-level name: transcribe_loop() reads it
    # as a global rather than receiving it as a parameter.
    args = cli.parse_args()

    main(args)
    print("Terminating.")
|