Reputation: 163
I'm looking for an optimized solution to load multiple huge .npy files using a PyTorch data loader. I'm currently using the following method, which creates a new dataloader for each file in each epoch.
My data loader is something like:
class GetData(torch.utils.data.Dataset):
    """Dataset that loads one pickled data array and one pickled target array.

    Both files are read completely into memory when the dataset is built
    and converted to float tensors.

    Args:
        data_path: Path to a pickle file containing a NumPy array of samples.
        target_path: Path to a pickle file containing a NumPy array of targets.
        transform: Optional callable applied to each sample tensor when it is
            fetched. ``None`` (the default) returns samples unchanged.
    """

    def __init__(self, data_path, target_path, transform=None):
        super().__init__()
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only use this class with files from a trusted source.
        with open(data_path, 'rb') as train_pkl_file:
            data = pickle.load(train_pkl_file)
        self.data = torch.from_numpy(data).float()
        with open(target_path, 'rb') as target_pkl_file:
            targets = pickle.load(target_pkl_file)
        self.targets = torch.from_numpy(targets).float()
        # Previously this argument was accepted but silently discarded.
        self.transform = transform

    def __getitem__(self, index):
        """Return ``(index, sample, target)`` for the given position."""
        x = self.data[index]
        if self.transform is not None:
            x = self.transform(x)
        y = self.targets[index]
        return index, x, y

    def __len__(self):
        """Return the number of samples (size of the first data dimension)."""
        return self.data.shape[0]
I have lists of .npy files:
# The second list used to be bound to `list1` as well, which discarded the
# first list; it now has its own name so both remain available.
list1 = ['d1.npy', 'd2.npy', 'd3.npy']
list2 = ['s1.npy', 's2.npy', 's3.npy']
I have created a dataset that returns the filenames:
class MyDataset(torch.utils.data.Dataset):
    """Dataset that yields pairs of file names taken from two parallel lists.

    Args:
        flist: Either the first list of file names (when ``flist2`` is
            given), or a pair ``(first_list, second_list)``.
            NOTE(review): the original constructor read the undefined names
            ``flist1``/``flist2``; treating a lone ``flist`` as a pair of
            lists is an assumption -- confirm against the caller.
        flist2: Optional second list of file names, parallel to ``flist``.

    Raises:
        ValueError: If the two lists do not have the same length.
    """

    def __init__(self, flist, flist2=None):
        super().__init__()
        if flist2 is None:
            # Single-argument form: unpack the pair of lists.
            flist1, flist2 = flist
        else:
            flist1 = flist
        if len(flist1) != len(flist2):
            raise ValueError(
                f"file lists differ in length: {len(flist1)} != {len(flist2)}"
            )
        self.npy_list1 = flist1
        self.npy_list2 = flist2

    def __getitem__(self, idx):
        """Return the ``idx``-th file name from each list as a tuple."""
        filename1 = self.npy_list1[idx]
        filename2 = self.npy_list2[idx]
        return filename1, filename2

    def __len__(self):
        """Return the number of file-name pairs."""
        return len(self.npy_list1)
And I iterate through them as follows:
for epoch in range(500):
    print('Epoch #%s' % epoch)
    model.train()
    # Three fresh, independent lists are created for every epoch.
    loss_, elbo_, recon_ = [[] for _ in range(3)]
    running_loss = 0
    # FOR EVERY SMALL FILE
    print("Training: ")
    # TRAIN HERE
    my_dataset = MyDataset(npyList)
    # Wrap the dataset itself rather than enumerate(): an enumerate object
    # has no len(), so tqdm(enumerate(...)) cannot show a total or an ETA.
    for idx, (dynamic_file, static_file) in enumerate(tqdm(my_dataset)):
        ...  # Do stuff
The above method works, but I'm looking for a more memory-efficient solution. Note: I have a huge amount of data (> 200 GB), so concatenating the numpy arrays into one file may not be a solution (due to RAM limitations). Thanks in advance.
Upvotes: 5
Views: 4067
Reputation: 1206
According to the numpy.load documentation, you can set the argument mmap_mode='r'
to receive a memory-mapped array (numpy.memmap) instead of a fully loaded array.
A memory-mapped array is kept on disk. However, it can be accessed and sliced like any ndarray. Memory mapping is especially useful for accessing small fragments of large files without reading the entire file into memory.
I tried implementing a dataset that uses memory maps. First, I generated some data as follows:
import os

import numpy as np

# Generate ten pairs of files, data/d{i}.npy and data/s{i}.npy, where pair i
# holds 1000 * (i + 1) rows of random float32 values.
feature_size = 16
total_count = 0
# np.save does not create missing directories, so make sure the output
# directory exists before writing into it.
os.makedirs('data', exist_ok=True)
for index in range(10):
    count = 1000 * (index + 1)
    D = np.random.rand(count, feature_size).astype(np.float32)
    S = np.random.rand(count, 1).astype(np.float32)
    np.save(f'data/d{index}.npy', D)
    np.save(f'data/s{index}.npy', S)
    total_count += count
print("Dataset size:", total_count)
# Each row stores feature_size data values plus one target value; the size
# of one value comes from the dtype (4 bytes for float32).
bytes_per_value = np.dtype(np.float32).itemsize
print("Total bytes:", total_count * (feature_size + 1) * bytes_per_value, "bytes")
The output was:
Dataset size: 55000
Total bytes: 3740000 bytes
Then, my implementation of the dataset is as follows:
import numpy as np
import torch
from bisect import bisect
import os, psutil # used to monitor memory usage
class BigDataset(torch.utils.data.Dataset):
    """Dataset spanning several ``.npy`` files opened as read-only memory maps.

    Every data file is paired with the target file at the same position in
    ``target_paths``. The files are opened with ``mmap_mode='r'`` and exposed
    as one contiguous index range; only the requested sample is copied into
    memory when it is fetched.

    Args:
        data_paths: Paths of the ``.npy`` files holding the samples.
        target_paths: Paths of the ``.npy`` files holding the targets, in the
            same order as ``data_paths``.

    Raises:
        ValueError: If the number of data and target files differs, or if a
            data file and its target file have different first dimensions.
    """

    def __init__(self, data_paths, target_paths):
        super().__init__()
        data_paths = list(data_paths)
        target_paths = list(target_paths)
        if len(data_paths) != len(target_paths):
            raise ValueError(
                f"got {len(data_paths)} data files but "
                f"{len(target_paths)} target files"
            )
        self.data_memmaps = [np.load(path, mmap_mode='r') for path in data_paths]
        self.target_memmaps = [np.load(path, mmap_mode='r') for path in target_paths]
        # start_indices[i] is the global index of the first sample of file i.
        self.start_indices = [0] * len(data_paths)
        self.data_count = 0
        pairs = zip(self.data_memmaps, self.target_memmaps)
        for index, (data_memmap, target_memmap) in enumerate(pairs):
            if data_memmap.shape[0] != target_memmap.shape[0]:
                raise ValueError(
                    f"{data_paths[index]} has {data_memmap.shape[0]} samples but "
                    f"{target_paths[index]} has {target_memmap.shape[0]} targets"
                )
            self.start_indices[index] = self.data_count
            self.data_count += data_memmap.shape[0]

    def __len__(self):
        """Return the total number of samples across all files."""
        return self.data_count

    def __getitem__(self, index):
        """Return ``(index, data, target)`` for a global sample index.

        Negative indices count from the end; the returned ``index`` is the
        normalized, non-negative position.

        Raises:
            IndexError: If ``index`` is outside the dataset.
        """
        if index < 0:
            index += self.data_count
        if not 0 <= index < self.data_count:
            raise IndexError(
                f"index out of range for dataset of size {self.data_count}"
            )
        # bisect returns the insertion point to the right of equal entries,
        # so subtracting one selects the file whose range contains `index`
        # (and skips over files that contain no samples).
        memmap_index = bisect(self.start_indices, index) - 1
        index_in_memmap = index - self.start_indices[memmap_index]
        # Copy only this sample into a regular, writable array:
        # torch.from_numpy warns on read-only buffers and rejects the NumPy
        # scalars produced by indexing a one-dimensional array.
        data = np.array(self.data_memmaps[memmap_index][index_in_memmap])
        target = np.array(self.target_memmaps[memmap_index][index_in_memmap])
        return index, torch.from_numpy(data), torch.from_numpy(target)
# Test Code
if __name__ == "__main__":
    # Build the two parallel lists of file paths.
    data_paths = []
    target_paths = []
    for file_number in range(10):
        data_paths.append(f'data/d{file_number}.npy')
        target_paths.append(f'data/s{file_number}.npy')

    # Compare the resident set size of this process before and after the
    # dataset is constructed.
    current_process = psutil.Process(os.getpid())
    rss_before = current_process.memory_info().rss
    dataset = BigDataset(data_paths, target_paths)
    rss_after = current_process.memory_info().rss
    print("Used memory:", rss_after - rss_before, "bytes")

    dataset_size = len(dataset)
    print("Dataset size:", dataset_size)

    # Show the first, middle and last samples.
    print("Samples:")
    for position in (0, dataset_size // 2, dataset_size - 1):
        print(dataset[position])
The output was as follows:
Used memory: 299008 bytes
Dataset size: 55000
Samples:
(0, tensor([0.5240, 0.2931, 0.9039, 0.9467, 0.8710, 0.2147, 0.4928, 0.8309, 0.7344, 0.2861, 0.1557, 0.7009, 0.1624, 0.8608, 0.5378, 0.4304]), tensor([0.7725]))
(27500, tensor([0.8109, 0.3794, 0.6377, 0.4825, 0.2959, 0.6325, 0.7278, 0.6856, 0.1037, 0.3443, 0.2469, 0.4317, 0.6690, 0.4543, 0.7007, 0.5733]), tensor([0.7856]))
(54999, tensor([0.4013, 0.9990, 0.9107, 0.9897, 0.0204, 0.2776, 0.5529, 0.5752, 0.2266, 0.9352, 0.2130, 0.9542, 0.4116, 0.4959, 0.1436, 0.9840]), tensor([0.6342]))
According to the results, the memory used is only about 8% of the total data size (299,008 of 3,740,000 bytes). I didn't try my code with very large files, so I don't know how efficient it will be with more than 200 GB of data. If you can try it and tell me the memory usage with and without memmaps, I would be grateful.
Upvotes: 9