Python Generate Hash Audio Fingerprinting for Audio Retrieval and Detection – Machine Learning Tutorial

By | November 21, 2022

In this tutorial, we will show you how to use Python to generate hash-based audio fingerprints for audio retrieval and detection. The tutorial has three parts, which we will go through one by one.

Part 1: Generate hash audio fingerprinting

This is the core of the tutorial. The hash feature of an audio file is generated as follows:

  • read a wav file and compute its (log) STFT spectrogram
  • detect the spectral peaks of the audio
  • generate the hash features from the detected peaks

This is the full code. The file is named file_util.py (the scripts in Part 2 and Part 3 import it as file_util):

import os
import numpy as np
import librosa
import librosa.display
from matplotlib import pyplot as plt
import hashlib
import pickle

from skimage.feature import peak_local_max

def traverseDir(dir, filetype = ".wav"):
    """Recursively collect the paths of all files under `dir` ending in `filetype`.

    Args:
        dir: Directory to walk (sub-directories are visited recursively).
        filetype: Suffix a path must end with to be collected, e.g. ".wav".

    Returns:
        A list of path strings. The order follows os.scandir and is therefore
        not guaranteed to be sorted.
    """
    files = []
    # Using scandir as a context manager closes the directory handle even when
    # an error is raised part-way through the walk.
    with os.scandir(dir) as entries:
        for entry in entries:
            if entry.is_dir():
                files.extend(traverseDir(entry.path, filetype))
            elif entry.is_file() and entry.path.endswith(filetype):
                files.append(entry.path)
    return files

def getFilePathInfo(absolute):
    """Split a path into its directory, file name without extension, and extension.

    Returns:
        A (dirname, filename, extend) tuple; `extend` keeps its leading dot and
        is an empty string when the file has no extension.
    """
    dirname, basename = os.path.split(absolute)
    filename, extend = os.path.splitext(basename)
    return dirname, filename, extend
def save_to_file(label_bo, file = 'label.bin'):
    """Pickle `label_bo` into `file`, overwriting any existing content."""
    with open(file, mode="wb") as sink:
        pickle.Pickler(sink).dump(label_bo)
def load_from_file(file = 'label.bin'):
    """Read `file` and return the object that was pickled into it.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    produced by a trusted source.
    """
    with open(file, mode="rb") as source:
        return pickle.Unpickler(source).load()
def normalise(wave):
    """Min-max scale `wave` into the range [0, 1].

    Args:
        wave: Array of samples.

    Returns:
        An array of the same shape where the smallest input value maps to 0
        and the largest to 1. A constant input (e.g. pure silence) maps to all
        zeros instead of producing NaN through a 0/0 division.
    """
    lowest = np.min(wave)
    centred = wave - lowest
    span = np.max(wave) - lowest
    if span == 0:
        # Constant signal: `centred` is already all zeros, so dividing by 1
        # keeps the result finite.
        span = 1
    return centred / span

def calculate_stft(file, sr = 8000, n_fft = 512, win_length = 200, hop_length = 80, plot=True):
    """Return the magnitude STFT of an audio file or of already-loaded samples.

    Args:
        file: Path of an audio file (str), or the waveform samples themselves.
        sr: Sample rate passed to librosa.load; for raw samples it is only
            used to label the plot axes.
        n_fft: FFT size.
        win_length: Length of the Hann window, in samples.
        hop_length: Step between successive frames, in samples.
        plot: When true, display the spectrogram in dB with matplotlib.

    Returns:
        The STFT magnitude with 1e-9 added to every bin.
    """
    # read wav file data
    if isinstance(file, str):
        y, sr = librosa.load(file, sr = sr, mono = True)
    else:
        y = file # file is wavdata
    # normalise
    signal = normalise(y)
    # compute STFT magnitude; the small offset keeps every bin strictly
    # positive so that a later log() stays finite
    D = np.abs(librosa.stft(signal, n_fft=n_fft, window='hann', win_length=win_length, hop_length=hop_length)) + 1e-9
    if plot:
        plt.figure(figsize=(10, 5))
        # hop_length must be forwarded: specshow otherwise assumes its own
        # default (512) and labels the time axis with the wrong scale.
        librosa.display.specshow(librosa.amplitude_to_db(D, ref=np.max), y_axis='linear',
                                 x_axis='time', cmap='gray_r', sr=sr, hop_length=hop_length)
        plt.show()
    return D
# find the spectral peaks and return them as a False/True matrix
def calculate_contellation_map(D, min_distance=10, threshold_rel=0.05, plot=True):
    """Detect local peaks of log(D) and return them as a boolean mask.

    Args:
        D: Strictly positive magnitude spectrogram (log is taken of it).
        min_distance: Minimum separation between peaks, forwarded to
            skimage.feature.peak_local_max.
        threshold_rel: Relative intensity threshold, forwarded to
            peak_local_max.
        plot: When true, display the peak mask with matplotlib.

    Returns:
        A boolean array with the shape of D that is True at every peak.
    """
    log_spectrum = np.log(D)
    # The `indices=False` option was removed from peak_local_max in
    # scikit-image 0.20, so request the peak coordinates (the default) and
    # build the mask from them.
    peaks = peak_local_max(log_spectrum, min_distance=min_distance, threshold_rel=threshold_rel)
    coordinates = np.zeros(log_spectrum.shape, dtype=bool)
    coordinates[tuple(peaks.T)] = True
    # feature coordinates
    print(type(coordinates))
    print(coordinates.shape)
    if plot:
        plt.figure(figsize=(10, 5))
        plt.imshow(coordinates, cmap="RdBu", origin='lower')
        plt.show()

    return coordinates
# create wav hash
def generate_target_hash(file, params):
    """Build the list of fingerprint hashes for one audio file.

    Every spectral peak is used as an anchor. Up to `fanout` later peaks,
    starting `anchor_distance` positions after the anchor, are described
    relative to it as (anchor frequency, target frequency, time offset).
    Each pair of consecutive descriptions is joined and hashed with SHA-256.

    Args:
        file: Path of the audio file. It must be a path string because the
            result is keyed by the file name derived from it.
        params: Dict with the keys "min_distance", "threshold_rel", "fanout",
            "anchor_distance", "sr", "n_fft", "win_length", "hop_length" and
            "plot".

    Returns:
        {filename: [hash, ...]} with duplicate hashes removed and first-seen
        order kept, or an empty dict when fewer than `anchor_distance + 2`
        peaks are found or no hash could be formed.
    """
    # 1. unpack the parameters
    min_distance = params['min_distance']
    threshold_rel = params["threshold_rel"]
    fanout = params["fanout"]
    anchor_distance = params["anchor_distance"]

    sr = params["sr"]
    n_fft = params["n_fft"]
    win_length = params["win_length"]
    hop_length = params["hop_length"]
    plot = params["plot"]

    stft = calculate_stft(file, sr, n_fft, win_length, hop_length, plot = plot)
    coordinates = calculate_contellation_map(stft, min_distance=min_distance, threshold_rel=threshold_rel, plot = plot)
    print(f'Generating Hash {file}...')
    # 2. get file name
    _, filename, _ = getFilePathInfo(file)
    target_address = {}
    # 3. list the peaks, transposed so they are ordered by time, then frequency.
    # tolist() yields plain Python ints: str() of NumPy integers changed in
    # NumPy 2 ("np.int64(3)" instead of "3"), which would silently produce
    # different hashes for the same audio depending on the NumPy version.
    times, frequencies = (axis.tolist() for axis in np.nonzero(coordinates.T))
    peak_count = len(times)
    if peak_count < (anchor_distance + 2): # at least two features
        return target_address

    hashes = []
    seen = set()  # O(1) duplicate check; `hashes` keeps first-seen order
    for i in range(peak_count):
        anchor_time = times[i]
        anchor_frequency = frequencies[i]

        # target zone: up to `fanout` peaks starting `anchor_distance` after the anchor
        first = i + anchor_distance
        last = min(first + fanout, peak_count)
        keys = [
            str((anchor_frequency, frequencies[k], times[k] - anchor_time))
            for k in range(first, last)
        ]

        # one hash per pair of neighbouring targets
        for key_a, key_b in zip(keys, keys[1:]):
            hash_str = key_a + "_" + key_b
            hash_ = hashlib.sha256(hash_str.encode()).hexdigest()
            if hash_ not in seen:
                seen.add(hash_)
                hashes.append(hash_)

    if hashes:
        target_address[filename] = hashes
    return target_address
# target_address_list: a list of {filename: [hash, ...]} dicts
def save_target_hash_feature(feature_file, target_address_list):
    """Invert the per-file hash lists into {hash: [filename, ...]} and save it.

    Args:
        feature_file: Path of the file the inverted index is written to.
        target_address_list: Iterable of {filename: [hash, ...]} dicts.
    """
    inverted = {}
    for per_file in target_address_list:
        for name, hashes in per_file.items():
            for digest in hashes:
                inverted.setdefault(digest, []).append(name)
    save_to_file(inverted, feature_file)


def count_list_frequency_proportion(list_data):
    """Count how often each element occurs and which share of the list it makes up.

    Elements are keyed by their str() form.

    Returns:
        A (counts, shares) pair of dicts; shares[key] is counts[key] divided
        by the length of `list_data`.
    """
    size = len(list_data)
    counts = {}
    for item in list_data:
        label = str(item)
        counts[label] = counts.get(label, 0) + 1
    shares = {label: hits / size for label, hits in counts.items()}
    return counts, shares
def search(query_hash_list, target_hash_dict, top = 3):
    """Rank database files by how many query hashes they share.

    Args:
        query_hash_list: Hashes generated from the query audio.
        target_hash_dict: Inverted index {hash: [filename, ...]}.
        top: Number of best matches to return; None returns all of them.

    Returns:
        A list of (filename, match_count) tuples, best match first, cut to
        at most `top` entries. The complete ranking is still printed.
    """
    # collect every file name that shares a hash with the query
    matches = []
    for _hash in query_hash_list:
        if _hash in target_hash_dict:
            matches.extend(target_hash_dict[_hash])
    # count the hits per file name, keyed by str(name); counted inline because
    # only the frequencies are needed here, not the proportions
    freq = {}
    for name in matches:
        key = str(name)
        freq[key] = freq.get(key, 0) + 1
    #{'1': 3, '2': 3, '4': 2, '5': 1, '6': 1, '7': 1, '3': 1}
    # sort by hit count; the sort is stable, so ties keep first-seen order
    results = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    print(results)
    if top is None:
        return results
    return results[:max(top, 0)]

Part 2: Generate hash audio fingerprinting and save

Next, we generate the hash audio fingerprints of the wav files and save them:

import os
#import IPython.display as ipd
import numpy as np
import librosa
import librosa.display
from matplotlib import pyplot as plt
from skimage.feature import peak_local_max
import pickle
import hashlib
import file_util

# targetDir: directory holding the wav files that will be searched
targetDir = 'database_recordings'

# Peak-picking, hashing and STFT settings handed to generate_target_hash
params = {
    "plot": True,
    "min_distance": 4,
    "threshold_rel": 0.05,
    "anchor_distance": 5,
    "fanout": 28,
    "sr": 8000,
    "n_fft": 512,
}
# window and hop are 25 ms and 10 ms expressed in samples
params["win_length"] = int(0.025 * params["sr"])
params["hop_length"] = int(0.01 * params["sr"])
#--------------------------------------------------

targets = file_util.traverseDir(targetDir)

# one {filename: [hash, ...]} dict per wav file
target_address_list = [
    file_util.generate_target_hash(f, params) for f in targets
]


# save hash feature
feature_file = "hash_feature.bin"
file_util.save_target_hash_feature(feature_file, target_address_list)

In this example, all wav files are stored in the database_recordings directory. We generate the hash features of all these files and save them to hash_feature.bin.

Run this code, you will see:

Python Generate Hash Audio Fingerprinting for Audio Retrieval and Detection - Machine Learning Tutorial

Part 3: Search audio based on hash audio fingerprinting

Here is an example code:

import os
#import IPython.display as ipd
import numpy as np
import librosa
import librosa.display
from matplotlib import pyplot as plt
from skimage.feature import peak_local_max
import pickle
import hashlib
import file_util


# queryDir: directory holding the wav snippets to look up
queryDir = 'query_recordings'

# These settings should match the ones used when hash_feature.bin was built
params = {}
params["plot"] = False
params["min_distance"] = 4
params["threshold_rel"] = 0.05
params["anchor_distance"] = 5
params["fanout"] = 28
params["sr"] = 8000
params["n_fft"] = 512
params["win_length"] = int(0.025*params["sr"])
params["hop_length"] = int(0.01*params["sr"])
#--------------------------------------------------

queries = file_util.traverseDir(queryDir)
feature_file = "hash_feature.bin"
# NOTE: the feature file is a pickle; only load one you created yourself
target_hash_dict = file_util.load_from_file(feature_file)
# only the first query file is searched in this example
for f in queries[0:1]:
    dirname, filename, extend = file_util.getFilePathInfo(f)
    hash_feature = file_util.generate_target_hash(f,params)
    # generate_target_hash returns an empty dict when the clip has too few
    # peaks, so fall back to an empty hash list instead of raising KeyError
    query_hash_list = hash_feature.get(filename, [])
    print(type(query_hash_list))

    results = file_util.search(query_hash_list, target_hash_dict, top = 3)

Run this code, and we may see:

<class 'numpy.ndarray'>
(257, 1004)
Generating Hash query_recordings\pop.00085-snippet-10-20.wav...
<class 'list'>
[('pop.00085', 11), ('classical.00005', 2), ('classical.00003', 1), ('classical.00007', 1), ('pop.00086', 1), ('classical.00004', 1), ('classical.00001', 1), ('classical.00002', 1)]

Process finished with exit code 0

We can see that pop.00085.wav is the best match for pop.00085-snippet-10-20.wav: 11 hash features are matched.

Moreover, we can also use VAD to remove silence from a wav file. Here is the tutorial:

Simple Guide to Use Python webrtcvad to Remove Silence and Noise in an Audio – Python Tutorial

Leave a Reply