Reputation: 81
Hey, I am looking to build a program in Python which will recognise what I speak through the microphone and convert it to text. Can you please suggest a few efficient speech processing libraries for achieving this?
Upvotes: 8
Views: 10376
Reputation: 1
I would recommend going with PyAudio, which is efficient at capturing voice. You can also build real-time speech capturing by obtaining the audio frames, converting them to an RMS level, and keeping a threshold so that only audio above the threshold is captured.
import pyaudio
import sys
import time
import wave
from os import makedirs
from uuid import uuid4

import numpy as np

print("Speech Recording")
print("Recording when Decibel Level is above Threshold")
print("_ - No Recording (Decibel Level Low)")
print("1 - Recording (Decibel Level High)")
input("Press Enter to continue...")
print("\nMonitor Decibel Level Below:")

# Audio capture configuration.
FORMAT = pyaudio.paInt16     # 16-bit signed samples
CHANNELS = 1                 # mono capture
RATE = 16000                 # sample rate in Hz
FRAMES_PER_BUFFER = 320      # 20 ms of audio per read at 16 kHz
DECIBEL_THRESHOLD = 60       # level above which input counts as speech
IDLE_TIME = .5               # seconds of silence that end an utterance

pa = pyaudio.PyAudio()
stream = pa.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                 frames_per_buffer=FRAMES_PER_BUFFER)

inactive_session = False     # True while input has stayed below the threshold
inactive_since = time.time() # when the current silent stretch began
frames = []                  # raw buffers collected for the current utterance

while True:
    data = stream.read(FRAMES_PER_BUFFER)
    samples = np.frombuffer(data, dtype=np.int16).astype(np.float32)
    rms = np.sqrt(np.mean(np.square(samples)))
    # Guard against log10(0) when the buffer is pure digital silence.
    decibel_level = 20 * np.log10(rms) if rms > 0 else -np.inf
    if decibel_level > DECIBEL_THRESHOLD:
        sys.stdout.write('1')
        sys.stdout.flush()
        frames.append(data)
        inactive_session = False
    else:
        sys.stdout.write('_')
        sys.stdout.flush()
        if not inactive_session:
            inactive_session = True
            inactive_since = time.time()
    # After IDLE_TIME seconds of continuous silence, flush what was captured
    # to a WAV file. (Any non-empty frame list has a positive duration, so
    # the original "duration is zero" branch was unreachable and is dropped.)
    if inactive_session and (time.time() - inactive_since) > IDLE_TIME:
        if frames:
            filename = f'RECORDED-{str(time.time())}-{str(uuid4()).replace("-", "")}.wav'
            makedirs('recordings', exist_ok=True)
            # BUG FIX: the original opened a literal placeholder path instead
            # of the generated filename, so recordings were never written to
            # the intended per-utterance file.
            with wave.open(f'recordings/{filename}', 'wb') as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(pa.get_sample_size(FORMAT))
                wf.setframerate(RATE)
                wf.writeframes(b''.join(frames))
            print("\nSaved:", filename)
        frames = []
Upvotes: 0
Reputation: 1
import pyaudio
import sys
import time
import wave
from os import makedirs
from uuid import uuid4

import numpy as np

print("Speech Recording")
print("Recording when Decibel Level is above Threshold")
print("_ - No Recording (Decibel Level Low)")
print("1 - Recording (Decibel Level High)")
input("Press Enter to continue...")
print("\nMonitor Decibel Level Below:")

# Audio capture configuration.
FORMAT = pyaudio.paInt16     # 16-bit signed samples
CHANNELS = 1                 # mono capture
RATE = 16000                 # sample rate in Hz
FRAMES_PER_BUFFER = 320      # 20 ms of audio per read at 16 kHz
DECIBEL_THRESHOLD = 60       # level above which input counts as speech
IDLE_TIME = .5               # seconds of silence that end an utterance

pa = pyaudio.PyAudio()
stream = pa.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                 frames_per_buffer=FRAMES_PER_BUFFER)

inactive_session = False     # True while input has stayed below the threshold
inactive_since = time.time() # when the current silent stretch began
frames = []                  # raw buffers collected for the current utterance

while True:
    data = stream.read(FRAMES_PER_BUFFER)
    samples = np.frombuffer(data, dtype=np.int16).astype(np.float32)
    rms = np.sqrt(np.mean(np.square(samples)))
    # Guard against log10(0) when the buffer is pure digital silence.
    decibel_level = 20 * np.log10(rms) if rms > 0 else -np.inf
    if decibel_level > DECIBEL_THRESHOLD:
        sys.stdout.write('1')
        sys.stdout.flush()
        frames.append(data)
        inactive_session = False
    else:
        sys.stdout.write('_')
        sys.stdout.flush()
        if not inactive_session:
            inactive_session = True
            inactive_since = time.time()
    # After IDLE_TIME seconds of continuous silence, flush what was captured
    # to a WAV file. (Any non-empty frame list has a positive duration, so
    # the original "duration is zero" branch was unreachable and is dropped.)
    if inactive_session and (time.time() - inactive_since) > IDLE_TIME:
        if frames:
            filename = f'RECORDED-{str(time.time())}-{str(uuid4()).replace("-", "")}.wav'
            makedirs('recordings', exist_ok=True)
            # BUG FIX: the original opened a literal placeholder path instead
            # of the generated filename, so recordings were never written to
            # the intended per-utterance file.
            with wave.open(f'recordings/{filename}', 'wb') as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(pa.get_sample_size(FORMAT))
                wf.setframerate(RATE)
                wf.writeframes(b''.join(frames))
            print("\nSaved:", filename)
        frames = []
Upvotes: 0
Reputation: 1248
The dragonfly sample code at https://pythonhosted.org/dragonfly/ is missing a required snippet from its code example.
from dragonfly.all import Grammar, CompoundRule
class ExampleRule(CompoundRule):
    """Voice command rule combining spoken form and recognition processing."""

    spec = "do something computer"  # Spoken form of command.

    def _process_recognition(self, node, extras):
        # Callback invoked by dragonfly when the command is spoken.
        # FIX: use the function-call form of print, which behaves identically
        # on Python 2 (parenthesized single argument) and Python 3.
        print("Voice command spoken.")
# Create a grammar which contains and loads the command rule.
grammar = Grammar("example grammar") # Grammar object that will hold the command rule.
grammar.add_rule(ExampleRule()) # Register the command rule with the grammar.
grammar.load() # Load the grammar into the speech recognition engine.
should be followed by
import time
import pythoncom

# Poll interval (seconds) between COM message-pump passes.
POLL_INTERVAL = .1

# Keep pumping waiting Win32/COM messages forever so that the dragonfly
# recognition callbacks continue to fire.
while True:
    pythoncom.PumpWaitingMessages()
    time.sleep(POLL_INTERVAL)
as mentioned here - http://dragonfly.googlecode.com/svn-history/r46/trunk/dragonfly/examples/dragonfly-main.py
Upvotes: 0
Reputation: 13287
See pyspeech (python) - Transcribe mp3 files?
which talks about http://code.google.com/p/pyspeech/. You may also want to look at http://code.google.com/p/dragonfly/
Upvotes: 4