Reputation: 1
I found this script, which works fine for real-time speech recognition. However, my requirement is different: the program should first listen to the microphone, and when the user stops talking, the microphone should be disabled and the recorded audio should be transcribed to text so that I can perform some other actions on that text. When all of that is done, the microphone should be enabled again and the audio recording should resume. I have tried different approaches, such as a disable_microphone flag and using independent functions, but nothing worked. Here is the base code that needs to be modified:
import io
from pydub import AudioSegment
import speech_recognition as sr
import whisper
import queue
import tempfile
import os
import threading
import click
import torch
import numpy as np
# NOTE(review): the pasted snippet had lost `@click.command()`, both
# `threading.Thread(target=...` openers and the body of the final loop;
# they are restored here from the surviving fragments.
@click.command()
@click.option("--model", default="base", help="Model to use", type=click.Choice(["tiny","base", "small","medium","large"]))
@click.option("--english", default=False, help="Whether to use English model",is_flag=True, type=bool)
@click.option("--verbose", default=False, help="Whether to print verbose output", is_flag=True,type=bool)
@click.option("--energy", default=300, help="Energy level for mic to detect", type=int)
@click.option("--dynamic_energy", default=False,is_flag=True, help="Flag to enable dynamic engergy", type=bool)
@click.option("--pause", default=0.8, help="Pause time before entry ends", type=float)
@click.option("--save_file",default=False, help="Flag to save file", is_flag=True,type=bool)
def main(model, english,verbose, energy, pause,dynamic_energy,save_file):
    """Record from the microphone and print Whisper transcriptions."""
    # A scratch directory is only needed when clips are written to disk.
    temp_dir = tempfile.mkdtemp() if save_file else None

    # there are no english models for large
    if model != "large" and english:
        model = model + ".en"
    audio_model = whisper.load_model(model)

    # audio_queue: recorder -> transcriber; result_queue: transcriber -> main.
    audio_queue = queue.Queue()
    result_queue = queue.Queue()

    threading.Thread(
        target=record_audio,
        args=(audio_queue, energy, pause, dynamic_energy, save_file, temp_dir)).start()
    threading.Thread(
        target=transcribe_forever,
        args=(audio_queue, result_queue, audio_model, english, verbose, save_file)).start()

    # Block on the result queue and print each transcription as it arrives.
    while True:
        print(result_queue.get())
def record_audio(audio_queue, energy, pause, dynamic_energy, save_file, temp_dir):
    """Listen on the microphone forever and enqueue each captured phrase.

    Args:
        audio_queue: Queue that receives one item per captured phrase: a WAV
            file path when ``save_file`` is true, otherwise a float32 torch
            tensor of samples.
        energy: Initial ``energy_threshold`` for the recognizer.
        pause: ``pause_threshold`` for the recognizer.
        dynamic_energy: Value assigned to ``dynamic_energy_threshold``.
        save_file: When true, write each phrase to ``temp_dir`` as a WAV file.
        temp_dir: Directory for the WAV files; only used when ``save_file``
            is true.

    This function never returns.
    """
    # load the speech recognizer and set the initial energy threshold and pause threshold
    r = sr.Recognizer()
    r.energy_threshold = energy
    r.pause_threshold = pause
    r.dynamic_energy_threshold = dynamic_energy

    with sr.Microphone(sample_rate=16000) as source:
        print("Say something!")
        i = 0
        while True:
            # get and save audio to wav file
            audio = r.listen(source)
            if save_file:
                data = io.BytesIO(audio.get_wav_data())
                audio_clip = AudioSegment.from_file(data)
                filename = os.path.join(temp_dir, f"temp{i}.wav")
                audio_clip.export(filename, format="wav")
                audio_data = filename
            else:
                # Scale int16 PCM into floats in [-1, 1).
                # assumes the raw data is 16-bit samples -- TODO confirm
                torch_audio = torch.from_numpy(np.frombuffer(audio.get_raw_data(), np.int16).flatten().astype(np.float32) / 32768.0)
                audio_data = torch_audio

            # Hand the clip to the consumer; without this the queue stays
            # empty and nothing is ever transcribed.
            audio_queue.put_nowait(audio_data)
            i += 1
def transcribe_forever(audio_queue, result_queue, audio_model, english, verbose, save_file):
    """Transcribe items from ``audio_queue`` forever, publishing to ``result_queue``.

    Args:
        audio_queue: Queue of items to transcribe; each item is passed
            unchanged to ``audio_model.transcribe``.
        result_queue: Queue that receives one entry per transcribed item.
        audio_model: Object exposing ``transcribe(audio, **kwargs)`` that
            returns a mapping with a ``"text"`` key.
        english: When true, pass ``language='english'`` to the model.
        verbose: When false, publish ``"You said: " + text``; when true,
            publish the full result object.
        save_file: When true, each queue item is treated as a file path and
            removed after transcription.

    This function never returns.
    """
    while True:
        audio_data = audio_queue.get()

        # Transcribe exactly once per clip, with the language hint if asked.
        if english:
            result = audio_model.transcribe(audio_data,language='english')
        else:
            result = audio_model.transcribe(audio_data)

        if not verbose:
            predicted_text = result["text"]
            result_queue.put_nowait("You said: " + predicted_text)
        else:
            result_queue.put_nowait(result)

        # Remove the temporary file once it has been consumed.
        if save_file:
            os.remove(audio_data)
For now, we can simply ask the user whether the transcribed text is correct or not, just to create a pause between turning the microphone off and turning it on again. I have tried different approaches, such as a disable_microphone flag and using independent functions instead of threads to simply record and transcribe the audio, but none of them worked either.
Upvotes: 0
Views: 569
Reputation: 1
Voice recognition programs need to keep listening in order to work; the program cannot stop listening. Instead, you can use a wake word (like "Alexa" or "OK Google") to check whether the user is interacting with the program, and then run your code.
Upvotes: 0