When we build a face or speech recognition model, we have to train it on a large number of image or audio files. However, we cannot load all of the data into memory at once. In this situation, we have to keep the data on the local disk and load only part of it into memory during training.
Save image or audio data to npy file
In order to speed up data loading, we may read image or audio data to numpy array, then use numpy to save and load these data to memory.
We can use numpy.save() to write an array to an npy file and numpy.load() to read it back.
However, this method may leave a very large number of npy files on disk, so it is not a good approach.
Use LMDB to save audio and image data
LMDB (Lightning Memory-Mapped Database) is a lightweight embedded database that is very useful for storing key:value data.
Install lmdb in python
In order to use lmdb, we can use pip to install it.
pip install -i https://mirrors.aliyun.com/pypi/simple/ lmdb --trusted-host mirrors.aliyun.com
- pip install -i https://mirrors.aliyun.com/pypi/simple/ lmdb --trusted-host mirrors.aliyun.com
pip install -i https://mirrors.aliyun.com/pypi/simple/ lmdb --trusted-host mirrors.aliyun.com
To use lmdb to save key:value or key:(value, label) data, we can use the example below:
import lmdb
import os, sys
import pickle
class LMDB:
    """Thin convenience wrapper around a py-lmdb environment.

    Keys are ``str`` and are UTF-8 encoded before being stored. Values are
    ``bytes`` (stored as-is) or ``str`` (encoded before being stored).
    ``insert_with_label`` / ``get_with_label`` store and load a pickled
    ``(value, label)`` tuple under a single key.
    """

    def __init__(self, path="dataset", map_size=104857600 * 40):
        """Open (or create) the LMDB environment located at ``path``.

        Args:
            path: Directory that holds the LMDB environment.
            map_size: Maximum size of the database in bytes. 104857600 bytes
                is 100 MiB, so the default is 40 * 100 MiB. Writes beyond
                this limit fail with ``MDB_MAP_FULL``.
        """
        self.path = path
        self.map_size = map_size
        self.initialize()

    def initialize(self):
        """Open the environment; print the error and exit the process on failure.

        Raises:
            SystemExit: with a non-zero status when the environment cannot be
                opened, so the failure is not reported as a successful exit.
        """
        self.env = None
        # Fall back to the historical default if an instance was created
        # without going through __init__.
        map_size = getattr(self, "map_size", 104857600 * 40)
        try:
            self.env = lmdb.open(self.path, map_size=map_size)
        except Exception as e:
            print(e)
            print("open lmdb fail")
            raise SystemExit(1) from e

    def insert_with_label(self, key, value, label):
        """Pickle ``(value, label)`` and store it under ``key``.

        Returns:
            True if the record was written, False if pickling or the write
            failed (the error is printed).
        """
        try:
            obj = pickle.dumps((value, label))
        except Exception as e:
            print(e)
            return False
        # Report the payload size in units of 100 MiB (104857600 bytes).
        print(len(obj) / 104857600)
        # Propagate the real outcome of the write: insert() swallows its own
        # exceptions and signals failure only through its return value.
        return self.insert(key, obj)

    def insert(self, key, value):
        """Store ``value`` under ``key``.

        Args:
            key: ``str`` key.
            value: ``bytes`` (stored as-is) or ``str`` (encoded first).

        Returns:
            True on success, False if anything went wrong (the error is
            printed).
        """
        try:
            data = value if isinstance(value, bytes) else value.encode()
            # The context manager commits on success and aborts on error, so
            # a failed put never leaves a write transaction open.
            with self.env.begin(write=True) as txn:
                txn.put(key.encode(), data)
        except Exception as e:
            print(e)
            return False
        return True

    def delete(self, key):
        """Delete ``key``.

        Returns:
            The result of the underlying ``txn.delete`` call, or False if an
            error occurred (the error is printed).
        """
        try:
            with self.env.begin(write=True) as txn:
                return txn.delete(key.encode())
        except Exception as e:
            print(e)
            return False

    def get(self, key):
        """Return the raw stored value for the ``str`` key ``key``."""
        # Scope the read transaction so it is released as soon as the value
        # has been fetched.
        with self.env.begin() as txn:
            return txn.get(key.encode())

    def get_with_label(self, key):
        """Load the ``(value, label)`` tuple stored by ``insert_with_label``.

        NOTE: if ``get`` returns None for this key, ``pickle.loads`` raises
        ``TypeError``.
        """
        obj = self.get(key)
        # SECURITY: pickle.loads can execute arbitrary code; only open
        # databases that were written by trusted code.
        value, label = pickle.loads(obj)
        print(type(value), type(label))
        return value, label

    def length(self):
        """Return the number of entries reported by the environment."""
        # stat() looks like:
        # {'psize': 4096, 'depth': 1, 'branch_pages': 0, 'leaf_pages': 1,
        #  'overflow_pages': 689200, 'entries': 100}
        return self.env.stat()["entries"]

    def display(self):
        """Print every key followed by the label stored with it."""
        with self.env.begin() as txn:
            for raw_key, _ in txn.cursor():
                key = raw_key.decode("utf-8")
                print(key)
                _value, label = self.get_with_label(key)
                print(label)

    def close(self):
        """Close the underlying LMDB environment."""
        self.env.close()
import librosa

# Demo: store the same audio clip 100 times as key:(audio_data, label) records.
wav_file = "music-jamendo-0039.wav"
# librosa.load returns the decoded samples and the sampling rate;
# mono=True asks for a single-channel signal.
wav_data, sr = librosa.load(wav_file, mono=True)

db = LMDB(path="wavdb")
try:
    for i in range(100):
        key = str(i)
        # The label is simply the key itself in this demo.
        if not db.insert_with_label(key, wav_data, key):
            print("insert failed for key " + key)
            break
    print(db.length())
    db.display()
finally:
    # Always release the environment, even if loading or inserting failed.
    db.close()
- import lmdb
- import os, sys
- import pickle
- class LMDB:
- def __init__(self, path = "dataset"):
- self.path = path
- self.initialize()
- def initialize(self):
- self.env = None
- try:
- self.env = lmdb.open(self.path, map_size = 104857600*40 ) #104857600 100M
- except Exception as e:
- print(e)
- self.env = None
- if self.env is None:
- print("open lmdb fail")
- quit(0)
- def insert_with_label(self, key, value, label):
- try:
- obj = pickle.dumps((value, label))
- print(len(obj)/104857600)
- self.insert(key, obj)
- except Exception as e:
- print(e)
- return False
- return True
- def insert(self, key, value):
- try:
- #key is string, value is byte
- txn = self.env.begin(write=True)
- if isinstance(value, bytes):
- txn.put(key.encode(), value)
- else:
- txn.put(key.encode(), value.encode())
- txn.commit()
- except Exception as e:
- print(e)
- return False
- return True
- def delete(self, key):
- try:
- txn = self.env.begin(write=True)
- txn.delete(key.encode())
- txn.commit()
- except Exception as e:
- print(e)
- def get(self, key):
- # key is str
- txn = self.env.begin()
- name = txn.get(key.encode())
- return name
- def get_with_label(self, key):
- obj = self.get(key)
- value, label = pickle.loads(obj)
- print(type(value), type(label))
- return value, label
- def length(self):
- stat = self.env.stat()
- #{'psize': 4096, 'depth': 1, 'branch_pages': 0, 'leaf_pages': 1, 'overflow_pages': 689200, 'entries': 100}
- return stat["entries"]
- def display(self):
- txn = self.env.begin()
- cur = txn.cursor()
- for key, value in cur:
- print(key.decode("utf-8"))
- value, label = self.get_with_label(key.decode("utf-8"))
- print(label)
- def close(self):
- self.env.close()
- import librosa
- wav_file = "music-jamendo-0039.wav"
- wav_data, sr = librosa.load(wav_file, mono=True)
- db = LMDB(path = "wavdb")
- for i in range(100):
- key = str(i)
- value = wav_data
- label = str(key)
- db.insert_with_label(key, value, label)
- print(db.length())
- db.display()
- db.close()
import lmdb
import os, sys
import pickle
class LMDB:
    """Thin convenience wrapper around a py-lmdb environment.

    Keys are ``str`` and are UTF-8 encoded before being stored. Values are
    ``bytes`` (stored as-is) or ``str`` (encoded before being stored).
    ``insert_with_label`` / ``get_with_label`` store and load a pickled
    ``(value, label)`` tuple under a single key.
    """

    def __init__(self, path="dataset", map_size=104857600 * 40):
        """Open (or create) the LMDB environment located at ``path``.

        Args:
            path: Directory that holds the LMDB environment.
            map_size: Maximum size of the database in bytes. 104857600 bytes
                is 100 MiB, so the default is 40 * 100 MiB. Writes beyond
                this limit fail with ``MDB_MAP_FULL``.
        """
        self.path = path
        self.map_size = map_size
        self.initialize()

    def initialize(self):
        """Open the environment; print the error and exit the process on failure.

        Raises:
            SystemExit: with a non-zero status when the environment cannot be
                opened, so the failure is not reported as a successful exit.
        """
        self.env = None
        # Fall back to the historical default if an instance was created
        # without going through __init__.
        map_size = getattr(self, "map_size", 104857600 * 40)
        try:
            self.env = lmdb.open(self.path, map_size=map_size)
        except Exception as e:
            print(e)
            print("open lmdb fail")
            raise SystemExit(1) from e

    def insert_with_label(self, key, value, label):
        """Pickle ``(value, label)`` and store it under ``key``.

        Returns:
            True if the record was written, False if pickling or the write
            failed (the error is printed).
        """
        try:
            obj = pickle.dumps((value, label))
        except Exception as e:
            print(e)
            return False
        # Report the payload size in units of 100 MiB (104857600 bytes).
        print(len(obj) / 104857600)
        # Propagate the real outcome of the write: insert() swallows its own
        # exceptions and signals failure only through its return value.
        return self.insert(key, obj)

    def insert(self, key, value):
        """Store ``value`` under ``key``.

        Args:
            key: ``str`` key.
            value: ``bytes`` (stored as-is) or ``str`` (encoded first).

        Returns:
            True on success, False if anything went wrong (the error is
            printed).
        """
        try:
            data = value if isinstance(value, bytes) else value.encode()
            # The context manager commits on success and aborts on error, so
            # a failed put never leaves a write transaction open.
            with self.env.begin(write=True) as txn:
                txn.put(key.encode(), data)
        except Exception as e:
            print(e)
            return False
        return True

    def delete(self, key):
        """Delete ``key``.

        Returns:
            The result of the underlying ``txn.delete`` call, or False if an
            error occurred (the error is printed).
        """
        try:
            with self.env.begin(write=True) as txn:
                return txn.delete(key.encode())
        except Exception as e:
            print(e)
            return False

    def get(self, key):
        """Return the raw stored value for the ``str`` key ``key``."""
        # Scope the read transaction so it is released as soon as the value
        # has been fetched.
        with self.env.begin() as txn:
            return txn.get(key.encode())

    def get_with_label(self, key):
        """Load the ``(value, label)`` tuple stored by ``insert_with_label``.

        NOTE: if ``get`` returns None for this key, ``pickle.loads`` raises
        ``TypeError``.
        """
        obj = self.get(key)
        # SECURITY: pickle.loads can execute arbitrary code; only open
        # databases that were written by trusted code.
        value, label = pickle.loads(obj)
        print(type(value), type(label))
        return value, label

    def length(self):
        """Return the number of entries reported by the environment."""
        # stat() looks like:
        # {'psize': 4096, 'depth': 1, 'branch_pages': 0, 'leaf_pages': 1,
        #  'overflow_pages': 689200, 'entries': 100}
        return self.env.stat()["entries"]

    def display(self):
        """Print every key followed by the label stored with it."""
        with self.env.begin() as txn:
            for raw_key, _ in txn.cursor():
                key = raw_key.decode("utf-8")
                print(key)
                _value, label = self.get_with_label(key)
                print(label)

    def close(self):
        """Close the underlying LMDB environment."""
        self.env.close()
import librosa

# Demo: store the same audio clip 100 times as key:(audio_data, label) records.
wav_file = "music-jamendo-0039.wav"
# librosa.load returns the decoded samples and the sampling rate;
# mono=True asks for a single-channel signal.
wav_data, sr = librosa.load(wav_file, mono=True)

db = LMDB(path="wavdb")
try:
    for i in range(100):
        key = str(i)
        # The label is simply the key itself in this demo.
        if not db.insert_with_label(key, wav_data, key):
            print("insert failed for key " + key)
            break
    print(db.length())
    db.display()
finally:
    # Always release the environment, even if loading or inserting failed.
    db.close()
In this example, we will create an LMDB database in the wavdb directory.

Then we will save some audio data to it.
In this example, we will save key:(audio_data, label) to lmdb.
db.insert_with_label(key, value, label)
- db.insert_with_label(key, value, label)
db.insert_with_label(key, value, label)
In order to save (audio_data, label), we will use pickle.dumps((value, label)) to pack them to a byte object.
When we read (audio_data, label), we will use pickle.loads(obj) to unpack them.
As to map_size of lmdb, we should notice:
self.env = lmdb.open(self.path, map_size = 104857600*40 ) #104857600 100M
- self.env = lmdb.open(self.path, map_size = 104857600*40 ) #104857600 100M
self.env = lmdb.open(self.path, map_size = 104857600*40 ) #104857600 100M
104857600 bytes == 100 MiB
1099511627776 bytes == 1 TiB
We should set a sufficiently large map_size; otherwise, we may get the error: mdb_put: MDB_MAP_FULL: Environment mapsize limit reached