Reputation: 25
The following is part of my code. I train for epoch = 300, and each .npz file is 2.73 MB. My DataLoader uses a batch size of 64 across 8 GPUs in total, so one mini-batch should be about 64 × 8 × 2.73 MB ≈ 1.4 GB. My machine has 128 GB of RAM, and even after decompression the data should come nowhere near 128 GB. Yet the linked screenshot shows all 128 GB of memory occupied. How should I change my code?
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler

class VimeoDataset(Dataset):
    def __init__(self, dataset_name, batch_size=64):
        self.batch_size = batch_size
        self.path = '/data/train_sample/dataset/'
        self.dataset_name = dataset_name
        #self.load_data()
        self.h = 256
        self.w = 448
        xx = np.arange(0, self.w).reshape(1, -1).repeat(self.h, 0)  # xx shape is (256, 448)
        yy = np.arange(0, self.h).reshape(-1, 1).repeat(self.w, 1)  # yy shape is (256, 448)
        self.grid = np.stack((xx, yy), 2).copy()

    def __len__(self):
        return len(self.meta_data)

    def getimg(self, index):
        f = np.load('/data/train_sample/dataset/' + str(index) + '.npz')
        if index < 8000:
            train_data = f['i0i1gt']
            flow_data = f['ft0ft1']
        elif 8000 <= index < 10000:
            val_data = f['i0i1gt']
        else:
            pass
        if self.dataset_name == 'train':
            meta_data = train_data
        else:
            meta_data = val_data
        data = meta_data
        img0 = data[0:3].transpose(1, 2, 0)
        img1 = data[3:6].transpose(1, 2, 0)
        gt = data[6:9].transpose(1, 2, 0)
        flow_gt = flow_data.transpose(1, 2, 0)
        return img0, gt, img1, flow_gt

    def __getitem__(self, index):
        img0, gt, img1, flow_gt = self.getimg(index)
        ...

dataset = VimeoDataset('train')
sampler = DistributedSampler(dataset)
train_data = DataLoader(dataset, batch_size=args.batch_size, num_workers=8, pin_memory=True,
                        drop_last=True, sampler=sampler)
Upvotes: 0
Views: 221
Reputation: 3283
Following the comments above, I've had a go at fixing your dataset. Essentially, you need to pass a few more variables into the class so that it can easily distinguish between your training and validation data, without loading all of your data into memory. (Sometimes walking over the whole dataset is necessary, e.g. to compute statistics, but that should be done sequentially, one file at a time, never all at once.)
Disclaimer: I took a guess at using glob to find your .npz files, and at you wanting flow_data in your validation set as well (it is missing for the validation data in your code), so I fill it with zeros there.
import os
from glob import glob

import numpy as np
from torch.utils.data import Dataset

class VimeoDataset(Dataset):
    def __init__(self, npzs, batch_size=64, train_set=False):
        self.batch_size = batch_size
        self.train_set = train_set
        self.h = 256
        self.w = 448
        xx = np.arange(0, self.w).reshape(1, -1).repeat(self.h, 0)  # xx shape is (256, 448)
        yy = np.arange(0, self.h).reshape(-1, 1).repeat(self.w, 1)  # yy shape is (256, 448)
        self.grid = np.stack((xx, yy), 2).copy()
        self.npzs = npzs  # only file paths are kept in memory; samples are loaded lazily

    def __len__(self):
        return len(self.npzs)

    def getimg(self, index):
        f = np.load(self.npzs[index])  # loads a single 2.73 MB file per sample
        data = f['i0i1gt']
        if self.train_set:
            flow_data = f['ft0ft1']
        else:
            # no flow ground truth for validation; use zeros in channels-first
            # layout so the transpose below works the same as for real flow
            flow_data = np.zeros([4, self.h, self.w])
        img0 = data[0:3].transpose(1, 2, 0)
        img1 = data[3:6].transpose(1, 2, 0)
        gt = data[6:9].transpose(1, 2, 0)
        flow_gt = flow_data.transpose(1, 2, 0)
        return img0, gt, img1, flow_gt

    def __getitem__(self, index):
        img0, gt, img1, flow_gt = self.getimg(index)
        return img0, gt, img1, flow_gt

# sort numerically by filename so the first 8000 files match your original
# index-based train/validation split
npzs = sorted(glob('/data/train_sample/dataset/*.npz'),
              key=lambda p: int(os.path.splitext(os.path.basename(p))[0]))
train_val_split = 8000
train_dataset = VimeoDataset(npzs[:train_val_split], train_set=True)
val_dataset = VimeoDataset(npzs[train_val_split:])
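To plug the two datasets back into your distributed setup, give each split its own sampler and loader, mirroring your original DataLoader call. A minimal sketch, assuming the same args.batch_size and an already-initialized process group:

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

# one sampler/loader per split; DistributedSampler shards the indices across GPUs
train_sampler = DistributedSampler(train_dataset)
train_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=8,
                          pin_memory=True, drop_last=True, sampler=train_sampler)

val_sampler = DistributedSampler(val_dataset, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=args.batch_size, num_workers=8,
                        pin_memory=True, sampler=val_sampler)

With the lazy __getitem__, each worker only holds the samples it is currently decoding, so resident memory per process stays around num_workers × a few samples instead of growing with the dataset.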
Upvotes: 1