Reputation: 365
My goal is to silence all the parts of a .wav audio file where there is no speech. I am currently using webrtcvad, but all I achieve is removing the non-speech parts from the audio (with their example.py code: https://github.com/wiseman/py-webrtcvad/blob/master/example.py). If someone can point me to, or show me, how to achieve my goal, I would be grateful! This also sounds like a background noise removal problem.
Upvotes: 1
Views: 2603
Reputation: 6259
I assume that you want the WAV output to have the same duration as the input, just with the non-speech areas replaced by silence and the speech areas left unaltered.
The way to do this is to multiply the audio signal with the output from the detector. The detector should output 1.0 for passing through (speech signal), and 0.0 for silencing (non-speech).
Sometimes a small value is used instead of 0.0 for the blocked part, to just reduce the volume a bit rather than produce pure silence. For example 0.01 (-20 dB).
An abrupt transition can sometimes sound a bit rough. In that case one may apply a bit of smoothing or a fade; a simple option is an exponential moving average.
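For illustration, here is a minimal sketch of that gating idea, separate from the full example below. It assumes audio is a 1-D numpy array and gain is a per-sample curve that is 1.0 in speech regions and e.g. 0.01 (-20 dB) elsewhere; the exponential moving average fades the transitions instead of switching abruptly.

import numpy

def smooth_gain(gain, alpha=0.01):
    # exponential moving average over the per-sample gain curve;
    # smaller alpha gives a slower, smoother fade
    smoothed = numpy.empty_like(gain, dtype=float)
    state = float(gain[0])
    for i, g in enumerate(gain):
        state = alpha * g + (1.0 - alpha) * state
        smoothed[i] = state
    return smoothed

# silenced = audio * smooth_gain(gain)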
Below is complete example code in Python that implements this, using the pretrained vad-crdnn-libriparty model from the SpeechBrain project.
The code can also be found in this GitHub repo: https://github.com/jonnor/machinehearing/blob/master/handson/voice-activity-detection/supress.py
import math

import numpy
import pandas
import librosa
import librosa.display  # explicit import needed for specshow on some librosa versions
import soundfile
from speechbrain.pretrained import VAD

import matplotlib
import matplotlib.pyplot as plt
def detect_voice(
    path,
    activation_threshold = 0.70,
    deactivation_threshold = 0.25,
    min_pause = 0.200,
    min_activation = 0.100,
    save_dir = 'model_dir',
    segment_pre = 0.0,
    segment_post = 0.0,
    double_check_threshold = None,
    parallel_chunks = 4,
    chunk_size = 1.0,
    overlap_chunks = True,
    ):

    # do initial, coarse-detection
    vad = VAD.from_hparams(source="speechbrain/vad-crdnn-libriparty", savedir=save_dir)

    probabilities = vad.get_speech_prob_file(path,
        large_chunk_size=chunk_size*parallel_chunks,
        small_chunk_size=chunk_size,
        overlap_small_chunk=overlap_chunks)

    thresholded = vad.apply_threshold(probabilities,
        activation_th=activation_threshold,
        deactivation_th=deactivation_threshold).float()

    boundaries = vad.get_boundaries(thresholded)

    # refine boundaries using energy-based VAD
    boundaries = vad.energy_VAD(path, boundaries,
        activation_th=activation_threshold,
        deactivation_th=deactivation_threshold)

    # post-process to clean up
    if min_pause is not None:
        boundaries = vad.merge_close_segments(boundaries, close_th=min_pause)

    if min_activation is not None:
        boundaries = vad.remove_short_segments(boundaries, len_th=min_activation)

    if double_check_threshold:
        boundaries = vad.double_check_speech_segments(boundaries, speech_th=double_check_threshold)

    # convert to friendly pandas DataFrames with time info
    events = pandas.DataFrame(boundaries, columns=['start', 'end'])
    events['class'] = 'speech'

    p = numpy.squeeze(probabilities)
    times = pandas.Series(numpy.arange(0, len(p)) * vad.time_resolution, name='time')
    p = pandas.DataFrame(p, columns=['speech'], index=times)

    return p, events
def apply_gain(path, segments, default=0.0, out=None, sr=None):

    audio, sr = soundfile.read(path, always_2d=True)

    # compute gain curves
    gains = numpy.full_like(audio, librosa.db_to_power(default))
    for idx, seg in segments.iterrows():
        s = math.floor(sr * seg['start'])
        e = math.ceil(sr * seg['end'])
        gain = librosa.db_to_power(seg['gain'])
        gains[s:e, :] = gain

    # apply to audio
    audio = audio * gains

    if out is not None:
        soundfile.write(out, audio, samplerate=sr)

    return audio, sr
def plot_spectrogram(ax, path, sr=16000, hop_length=1024):
    audio, sr = librosa.load(path, sr=sr)
    S = librosa.feature.melspectrogram(y=audio, sr=sr, hop_length=hop_length)
    S_db = librosa.power_to_db(S, ref=numpy.max)
    librosa.display.specshow(ax=ax, data=S_db,
        sr=sr, hop_length=hop_length,
        x_axis='time', y_axis='mel')
    return S_db
def plot_vad(input_path, probabilities, boundaries, output_path):

    fig, (input_spec_ax, vad_ax, output_spec_ax) = plt.subplots(3, figsize=(10, 5), sharex=True)

    # show spectrogram
    plot_spectrogram(ax=input_spec_ax, path=input_path)

    # show VAD results
    probabilities.reset_index().plot(ax=vad_ax, x='time', y='speech')
    for start, end in zip(boundaries['start'], boundaries['end']):
        vad_ax.axvspan(start, end, alpha=0.3, color='green')
    vad_ax.xaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(1.0))
    vad_ax.grid(True, which='minor', axis='x')
    vad_ax.grid(True, which='major', axis='x')

    # show modified audio
    plot_spectrogram(ax=output_spec_ax, path=output_path)

    fig.tight_layout()
    return fig
# XXX: model only supports 16k samplerate
# If input is another samplerate, have to resample it first
path = 'voiceandnot_16k.wav'
prob, segments = detect_voice(path)
segments['gain'] = 0.0
out_path = 'voice-supressed.wav'
apply_gain(path, segments, default=-20.0, out=out_path)
fig = plot_vad(path, prob, segments, out_path)
fig.savefig('vad-output.png')
Here is an example plot, showing the input data, the VAD activations/segments and the modified output data.
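Note that, as the comment in the usage code above says, the pretrained model only supports 16 kHz audio. If your file has another samplerate, a minimal way to resample it first (the input filename here is just a placeholder) is with librosa and soundfile, which are already imported:

audio, sr = librosa.load('my_input.wav', sr=16000)  # loads as mono and resamples to 16 kHz
soundfile.write('voiceandnot_16k.wav', audio, samplerate=16000)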
Upvotes: 1