Simple Guide to Use Python webrtcvad to Remove Silence and Noise in an Audio – Python Tutorial

By | November 21, 2022

Python webrtcvad is a powerful tool for removing silence and noise from an audio file. In this tutorial, we will show you how to use it to remove them.

Python librosa can also remove silence from an audio file. You can view:

Python webrtcvad installation

We can use the pip command to install webrtcvad, for example: pip install webrtcvad. Here is the installation guide.

How to use python webrtcvad to remove silence and noise in an audio?

We will create some functions to implement it. For example:

import librosa
import os
import random
import numpy as np
import matplotlib.pyplot as plt
import soundfile

#!/usr/bin/env python
# -*- coding: utf-8 -*
from warnings import warn
import webrtcvad
from librosa.core import resample
from librosa.util import frame

def rle(inarray):
    """Run-length encode a one-dimensional sequence.

    Partial credit to R's ``rle`` function. Any array-like input is
    accepted (including non-NumPy sequences); it is converted with
    ``np.asarray`` first.

    Returns:
        tuple ``(runlengths, startpositions, values)`` of NumPy arrays,
        or ``(None, None, None)`` when the input is empty.
    """
    ia = np.asarray(inarray)  # force numpy
    n = len(ia)
    if n == 0:
        return (None, None, None)

    # y[k] is True when element k + 1 differs from element k,
    # i.e. when a run ends at index k.
    y = np.array(ia[1:] != ia[:-1])
    i = np.append(np.where(y), n - 1)  # must include last element position
    z = np.diff(np.append(-1, i))  # run lengths
    p = np.cumsum(np.append(0, z))[:-1]  # start positions
    return (z, p, ia[i])

def showVoiceTime(vact_left, fs):
    """Print the start and end time, in seconds, of every voiced run.

    vact_left: per-sample activity sequence; runs whose value equals 1
        are treated as voiced and printed.
    fs: sampling rate used to convert sample indices to seconds.

    Nothing is returned. For empty input only the header line is printed.
    """
    # Encode once and reuse both outputs (run lengths and run starts).
    runs = rle(vact_left)
    z, p = runs[0], runs[1]
    print("start, end (s)")
    if z is None:  # rle() reports empty input as (None, None, None)
        return
    for i, j in zip(z, p):
        if vact_left[j] == 1:
            print("{}s ,{}s".format(round(j/fs,2),round((i+j)/fs,2)))

def splitWav(path, sr = 8000) :
    """Load a two-channel audio file and return its channels separately.

    path: audio file to load with ``librosa.load``.
    sr: sampling rate passed to ``librosa.load`` (default 8000).

    Returns:
        tuple ``(left, right)`` of 1-D NumPy arrays: channel 0 and
        channel 1 of the file.

    Raises:
        ValueError: if the loaded audio does not have at least two channels.
    """
    data, samplerate = librosa.load(path, sr = sr, mono = False)
    # NOTE(review): assumes librosa.load(mono=False) returns an array shaped
    # (n_channels, n_samples) for multi-channel files -- confirm against the
    # installed librosa version.
    if data.ndim != 2 or data.shape[0] < 2:
        raise ValueError('audio must have at least two channels.')
    data = data.T  # one row per sample: (n_samples, n_channels)
    left = np.array(data[:, 0])
    right = np.array(data[:, 1])
    return left, right

def get_wav_list(source_file):
    """Return the paths of all ``.wav`` / ``.WAV`` files below a directory.

    The tree rooted at ``source_file`` is walked recursively with
    ``os.walk``; each match is reported as the containing directory joined
    to the file name with ``os.sep``. Other spellings of the extension
    (for example ``.Wav``) are not matched.
    """
    wav_suffixes = ('.wav', '.WAV')
    found = []
    for folder, _subdirs, names in os.walk(source_file):
        found.extend(
            os.sep.join((folder, name))
            for name in names
            if name.endswith(wav_suffixes)
        )
    return found

def vad(data, fs, fs_vad=16000, hop_length=30, vad_mode=0):
    """Voice activity detection.

    This was implemented for easier use of py-webrtcvad.

    Parameters
    ----------
    data : ndarray
        numpy array of mono (1 ch) speech data.
        1-d or 2-d;
        if 2-d, shape must be (1, time_length) or (time_length, 1).
        if data type is int, -32768 <= data <= 32767.
        if data type is float, -1 <= data <= 1.
    fs : int
        sampling frequency of data.
    fs_vad : int, optional
        Sampling frequency for webrtcvad.
        fs_vad must be 8000, 16000, 32000 or 48000.
        Default is 16000.
    hop_length : int, optional
        step size [milli second].
        hop_length must be 10, 20, or 30.
        Default is 30.
    vad_mode : int, optional
        set vad aggressiveness.
        As vad_mode increases, it becomes more aggressive.
        vad_mode must be 0, 1, 2 or 3.
        Default is 0.

    Returns
    -------
    vact : ndarray
        voice activity. time length of vact is same as input data.
        If 0, it is unvoiced, 1 is voiced.

    Raises
    ------
    ValueError
        If an argument is outside its allowed set, if the data type is
        neither int nor float, if the values are out of range, or if the
        data is not mono.
    """
    # check argument
    if fs_vad not in [8000, 16000, 32000, 48000]:
        raise ValueError('fs_vad must be 8000, 16000, 32000 or 48000.')

    if hop_length not in [10, 20, 30]:
        raise ValueError('hop_length must be 10, 20, or 30.')

    if vad_mode not in [0, 1, 2, 3]:
        raise ValueError("vad_mode must be 0, 1, 2 or 3.")

    # check data
    if data.dtype.kind == 'i':
        if data.max() > 2 ** 15 - 1 or data.min() < -2 ** 15:
            raise ValueError(
                    'when data.type is int, data must be - 32768 < data < 32767.')
        # normalise integers to the float range used below
        data = data.astype('f') / 2.0**15

    elif data.dtype.kind == 'f':
        if np.abs(data).max() > 1:
            raise ValueError(
                    'when data.type is float, data must be - 1.0 <= data <= 1.0.')
        data = data.astype('f')

    else:
        raise ValueError('data.dtype must be int or float.')

    data = data.squeeze()
    if not data.ndim == 1:
        raise ValueError('data must be mono (1 ch).')

    # resampling
    if fs != fs_vad:
        # keyword arguments: recent librosa releases reject positional rates
        resampled = resample(data, orig_sr=fs, target_sr=fs_vad)
        if np.abs(resampled).max() > 1.0:
            resampled *= (0.99 / np.abs(resampled).max())
            warn('Resampling causes data clipping. data was rescaled.')
    else:
        resampled = data

    # Scale to 16-bit PCM; clip so that a sample of exactly 1.0 does not
    # overflow the int16 range.
    resampled = np.clip(
        resampled * 2.0 ** 15, -2 ** 15, 2 ** 15 - 1).astype('int16')

    # Split into non-overlapping frames of hop_length milliseconds, zero
    # padding the tail so that the last frame is complete.
    hop = fs_vad * hop_length // 1000
    framelen = resampled.size // hop + 1
    padlen = framelen * hop - resampled.size
    paded = np.pad(resampled, (0, padlen), 'constant', constant_values=0)
    framed = frame(paded, frame_length=hop, hop_length=hop).T

    detector = webrtcvad.Vad()
    detector.set_mode(vad_mode)  # apply the requested aggressiveness
    valist = [detector.is_speech(tmp.tobytes(), fs_vad) for tmp in framed]

    # Expand the per-frame decisions back to one value per input sample.
    hop_origin = fs * hop_length // 1000
    va_framed = np.zeros([len(valist), hop_origin])

    va_framed[valist] = 1

    return va_framed.reshape(-1)[:data.size]

def do_vad(wav, **kwargs):
    """Run ``vad`` on a waveform, fill short unvoiced gaps, optionally plot.

    Recognised keyword options (all optional):
        isDraw: build a matplotlib figure of the waveform and the activity
            curve (default True).
        sample_rate: rate of ``wav``; also used as the VAD rate (default 8000).
        vad_mode: forwarded to ``vad`` (default 3).
        hop_length: forwarded to ``vad`` (default 10).
        min_vad_wav_time: unvoiced runs shorter than this many seconds are
            relabelled as voiced (default 0.5).

    Returns the per-sample activity array produced by ``vad`` after the
    short unvoiced runs have been filled in.
    """
    options = {
        "isDraw": True,
        "sample_rate": 8000,
        "vad_mode": 3,
        "hop_length": 10,
        "min_vad_wav_time": 0.5,
    }
    options.update(kwargs)
    rate = options["sample_rate"]

    # Time axis in seconds, one point per input sample.
    n_samples = len(wav)
    time = np.linspace(0, n_samples / rate, n_samples)

    vact = vad(
        wav,
        rate,
        fs_vad=rate,
        hop_length=options["hop_length"],
        vad_mode=options["vad_mode"],
    )

    run_lengths, run_starts = rle(vact)[:2]
    gap_limit = options["min_vad_wav_time"] * rate
    for length, start in zip(run_lengths, run_starts):
        is_short_gap = length < gap_limit and vact[start] == 0
        if not is_short_gap:
            continue
        # flip the whole run: 0 becomes 1
        vact[start:start + length] = vact[start] * -1 + 1

    if not options["isDraw"]:
        return vact

    fig, wave_axis = plt.subplots(figsize=(24, 6))
    wave_axis.plot(time, wav, label='speech waveform')
    wave_axis.set_xlabel("TIME [s]")

    # Second y-axis sharing the time axis, used for the activity curve.
    vad_axis = wave_axis.twinx()
    shown = min(vact.size, time.size)
    vad_axis.plot(time[0:shown], vact[0:shown], color="r", label='vad')
    plt.yticks([0, 1], ('unvoice', 'voice'))
    vad_axis.set_ylim([-0.01, 1.01])
    return vact

def splitwavandvad(wav_path) :
    """Print the voiced time ranges of both channels of a stereo file.

    wav_path: path of the two-channel audio file to analyse.

    The file is loaded once with ``splitWav``; each channel is passed
    through ``do_vad`` (vad_mode 3, minimum gap 0.5 s) and its voiced
    ranges are printed with ``showVoiceTime``. Nothing is returned.
    """
    sample_rate = 8000

    # Load and split once instead of decoding the file for each channel.
    left_wav, right_wav = splitWav(wav_path)

    vact_left = do_vad(wav = left_wav,**dict (vad_mode=3,min_vad_wav_time=0.5))
    showVoiceTime(vact_left, sample_rate)
    vact_right = do_vad(wav = right_wav,**dict(vad_mode=3,min_vad_wav_time=0.5))
    showVoiceTime(vact_right, sample_rate)
def wav_vad_double(zuoxi_audio_data, user_audio_data, sample_rate = 8000, min_vad_time = 0.5):
    """Return the user's voiced samples that fall outside the agent's speech.

    zuoxi_audio_data: waveform of the first ("zuoxi") channel.
    user_audio_data: waveform of the user channel.
        NOTE(review): both waveforms are assumed to have the same length,
        since their activity arrays are multiplied element-wise -- confirm.
    sample_rate: sampling rate of both waveforms (default 8000).
    min_vad_time: forwarded to ``do_vad`` as ``min_vad_wav_time``
        (default 0.5 seconds).

    Returns:
        1-D NumPy array with the samples of ``user_audio_data`` taken from
        every run where the user is voiced and the (adjusted) zuoxi
        activity is 0. Empty array when there is no such run.
    """
    vad_options = dict(
        vad_mode=3, min_vad_wav_time=min_vad_time, sample_rate=sample_rate)
    vact_zuoxi = do_vad(zuoxi_audio_data, **vad_options)
    vact_user = do_vad(user_audio_data, **vad_options)

    margin_len = 3 * sample_rate
    r = rle(vact_zuoxi)
    z, p = r[0], r[1]
    last_end = -margin_len
    if z is not None:  # rle() reports empty input as (None, None, None)
        for i, j in zip(z, p):
            if vact_zuoxi[j] != 1:
                continue
            start_id = j
            end_id = i + j
            # NOTE(review): 8000 samples is one second at the default rate;
            # presumably this should scale with sample_rate -- confirm.
            if start_id - last_end < 8000:
                last_end = end_id
                # NOTE(review): the visible source had no statement here,
                # which made this `if` a no-op; a `continue` (skip runs
                # that start soon after the previous one) is the presumed
                # intent -- confirm against the upstream code.
                continue
            last_end = end_id
            # clear the first margin_len samples of this voiced run
            vact_zuoxi[start_id:start_id + margin_len] = 0

    # 1 where the user is voiced and the zuoxi activity is 0
    vact_left = (1 - vact_zuoxi) * vact_user
    r = rle(vact_left)
    z, p = r[0], r[1]
    if z is None:
        return np.array([])

    # NOTE(review): the loop body was missing in the visible source;
    # collecting the user samples of each voiced run is the presumed intent.
    segments = [
        user_audio_data[j:j + i] for i, j in zip(z, p) if vact_left[j] == 1
    ]
    if not segments:
        return np.array([])
    return np.concatenate(segments)

def wav_vad(audio_data, sample_rate = 8000, min_vad_wav_time = 0.5):
    """Return only the voiced samples of a mono waveform.

    audio_data: mono waveform to filter.
    sample_rate: sampling rate of ``audio_data`` (default 8000).
    min_vad_wav_time: forwarded to ``do_vad``; unvoiced runs shorter than
        this many seconds are kept as voiced (default 0.5).

    Returns:
        1-D NumPy array with the samples of every voiced run, in order.
        Empty array when nothing is voiced.
    """
    vact_left = do_vad(
        audio_data,
        **dict(vad_mode=3, min_vad_wav_time=min_vad_wav_time,
               sample_rate=sample_rate))
    r = rle(vact_left)
    z, p = r[0], r[1]
    if z is None:  # rle() reports empty input as (None, None, None)
        return np.array([])

    # NOTE(review): the loop body was missing in the visible source;
    # collecting the samples of each voiced run is the presumed intent.
    segments = [
        audio_data[j:j + i] for i, j in zip(z, p) if vact_left[j] == 1
    ]
    if not segments:
        return np.array([])
    return np.concatenate(segments)
def save_wav(audio, fx, sr = 8000):
    """Write ``audio`` to the file ``fx`` with ``soundfile``.

    audio: sample data to write.
    fx: destination file.
    sr: sampling rate recorded in the file (default 8000).

    The data is stored with the 'PCM_16' subtype.
    """
    soundfile.write(file=fx, data=audio, samplerate=sr, subtype='PCM_16')

Then we can start removing the silence and noise.

First, we can see where the silence and noise are in an audio file.

# Path of the example recording to analyse.
wav = r"audio_data/speech-us-gov-0028.wav"
# Load the file as a single channel, resampled to 8000 Hz.
wav_data, sr = librosa.load(wav, sr = 8000, mono = True)

# Request vad_mode 3 and relabel unvoiced runs shorter than 0.5 s as voiced.
# isDraw is left at its default (True), so do_vad also builds the
# waveform / activity plot.
vad_wav = do_vad(wav = wav_data,**dict(vad_mode=3,min_vad_wav_time=0.5))

Here we should notice the sample rate is 8000 and the wav_data is a single channel.

Running this code, we may see:

Simple Guide to Use Python webrtcvad to Remove Silence and Noise in an Audio - Python Tutorial

Moreover, if we want to save the audio data without noise and silence, we can do as follows:

# Path of the example recording to clean.
wav = r"audio_data/speech-us-gov-0028.wav"
# Load the file as a single channel, resampled to 8000 Hz.
wav_data, sr = librosa.load(wav, sr = 8000, mono = True)

# wav_vad is called with its defaults (sample_rate 8000, min_vad_wav_time 0.5).
nosil_wav = wav_vad(wav_data)
# save_wav uses its default sample rate of 8000 and the 'PCM_16' subtype.
save_wav(nosil_wav, "test001.wav")

Here the sample rate is also 8000. We will save the audio data without silence and noise to the file test001.wav.

However, if librosa.load() returns wav data whose values are greater than 1.0, you can read this solution.

