dachun

Reputation: 25

Dataset size is smaller than memory, so what's wrong with my code?

The following is part of the code. I train for 300 epochs, each npz file is 2.73 MB, my DataLoader batch size is 64, and there are 8 GPUs in total, so one mini-batch should be about 64 × 8 × 2.73 MB ≈ 1.4 GB, while the machine has 128 GB of RAM. Even if the data grows after decompression, it should not come close to 128 GB. The system-usage figure linked below shows that all 128 GB of memory is occupied. How should I change my code?
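For reference, here is a minimal sketch of that back-of-the-envelope estimate (assuming uint8 images and float32 flow at the 256×448 resolution used in the code below; the dtypes are an assumption, not stated above):

import numpy as np

# assumed dtypes: uint8 for i0/i1/gt, float32 for the two flow maps
h, w = 256, 448
imgs = 9 * h * w * np.dtype(np.uint8).itemsize    # i0, i1, gt: 3 channels each
flow = 4 * h * w * np.dtype(np.float32).itemsize  # ft0, ft1: 2 channels each
per_sample = imgs + flow                          # bytes per decompressed sample

batch_size, gpus = 64, 8
print(f"one sample     ~ {per_sample / 2**20:.2f} MB")
print(f"one mini-batch ~ {batch_size * gpus * per_sample / 2**30:.2f} GB")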

class VimeoDataset(Dataset):
    def __init__(self, dataset_name, batch_size=64):
        self.batch_size = batch_size
        self.path = '/data/train_sample/dataset/'
        self.dataset_name = dataset_name
        #self.load_data()
        self.h = 256
        self.w = 448
        xx = np.arange(0, self.w).reshape(1,-1).repeat(self.h,0)  # xx shape is (256, 448)
        yy = np.arange(0, self.h).reshape(-1,1).repeat(self.w,1)  # yy shape is (256, 448)
        self.grid = np.stack((xx,yy),2).copy()

    def __len__(self):
        return len(self.meta_data)

    def getimg(self, index):
        f = np.load('/data/train_sample/dataset/' + str(index) + '.npz')
        if index < 8000:
            train_data = f['i0i1gt']
            flow_data = f['ft0ft1']
        elif 8000 <= index < 10000:
            val_data = f['i0i1gt']
        else:
            pass

        if self.dataset_name == 'train':
            meta_data = train_data
        else:
            meta_data = val_data

        data = meta_data
        img0 = data[0:3].transpose(1, 2, 0)
        img1 = data[3:6].transpose(1, 2, 0)
        gt = data[6:9].transpose(1, 2, 0)
        flow_gt = flow_data.transpose(1, 2, 0)
        return img0, gt, img1, flow_gt

    def __getitem__(self, index):
        img0, gt, img1, flow_gt = self.getimg(index)
        ...

dataset = VimeoDataset('train')

sampler = DistributedSampler(dataset)
train_data = DataLoader(dataset, batch_size=args.batch_size, num_workers=8, pin_memory=True,
                        drop_last=True, sampler=sampler)

[figure: system memory usage, all 128 GB occupied]

Upvotes: 0

Views: 221

Answers (1)

jhso

Reputation: 3283

I have had a go at fixing your dataset based on our comments above. Essentially, you need to pass more variables into your class so that it can easily differentiate between your training and validation data. This is done without loading all of your data into memory, although sometimes that is necessary (sequentially, not all at once) to calculate dataset statistics and such.

Disclaimer: I took a guess at using glob to find your npz files, and at how flow_data should be handled for your validation set (it is missing for the validation data in your code).

from glob import glob
import numpy as np
from torch.utils.data import Dataset

class VimeoDataset(Dataset):
    def __init__(self, npzs, batch_size=64, train_set=False):
        self.batch_size = batch_size
        self.train_set = train_set
        self.h = 256
        self.w = 448
        xx = np.arange(0, self.w).reshape(1,-1).repeat(self.h,0)  # xx shape is (256, 448)
        yy = np.arange(0, self.h).reshape(-1,1).repeat(self.w,1)  # yy shape is (256, 448)
        self.grid = np.stack((xx,yy),2).copy()
        self.npzs = npzs

    def __len__(self):
        return len(self.npzs)
    
    def getimg(self, index):
        f = np.load(self.npzs[index])
        data = f['i0i1gt']
        if self.train_set:
            flow_data = f['ft0ft1']
        else:
            # validation files have no flow, so use a channel-first placeholder
            # shaped like ft0ft1 so the transpose below works for both branches
            flow_data = np.zeros([4, self.h, self.w])
        img0 = data[0:3].transpose(1, 2, 0)
        img1 = data[3:6].transpose(1, 2, 0)
        gt = data[6:9].transpose(1, 2, 0)
        flow_gt = flow_data.transpose(1, 2, 0)
        return img0, gt, img1, flow_gt
    
    def __getitem__(self, index):
        img0, gt, img1, flow_gt = self.getimg(index)
        return img0, gt, img1, flow_gt

# sorted() so the index-based train/val split is deterministic (glob order is not)
npzs = sorted(glob('/data/train_sample/dataset/*.npz'))
train_val_split = 8000
train_dataset = VimeoDataset(npzs[:train_val_split], train_set=True)
val_dataset = VimeoDataset(npzs[train_val_split:])
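If you want to plug these back into the loaders from your question, the wiring is the same (a minimal sketch; args.batch_size and the distributed process group are assumed to already exist in your script):

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

# one loader per split; only the training loader uses the distributed sampler here
train_sampler = DistributedSampler(train_dataset)
train_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=8,
                          pin_memory=True, drop_last=True, sampler=train_sampler)
val_loader = DataLoader(val_dataset, batch_size=args.batch_size, num_workers=8,
                        pin_memory=True)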

Upvotes: 1
