Reputation: 77
After I applied data augmentation, like this:
train = ImageDataGenerator(
    rotation_range=90,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zca_whitening=True,
    shear_range=0.2)
test = ImageDataGenerator(rescale=1./255)
train_dataset = train.flow_from_directory("/content/drive/MyDrive/dataset",
                                          target_size=(150, 150),
                                          batch_size=32,
                                          class_mode='binary')
Output:
Found 327 images belonging to 2 classes.
Since 327 images is the size of my original dataset, has my training dataset actually been increased by the data augmentation? If it has, how do I get the new dataset size?
Upvotes: 1
Views: 2877
Reputation: 8092
Yes, you can increase the size by creating and saving augmented images for each class and then merging those images with the original training set. Below is a function I use for that, called balance. You pass it a dataframe that has the columns filepaths and labels. The filepaths column contains the full file path to each training image, and labels is the corresponding class label. n is the target number of images per class, working_dir is the directory where the augmented images are stored, and img_size is the size of the augmented images. A sketch of building such a dataframe from a directory like yours is shown after the code.
import os
import shutil
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def balance(df, n, working_dir, img_size):
    def augment(df, n, working_dir, img_size):
        aug_dir = os.path.join(working_dir, 'aug')
        os.mkdir(aug_dir)
        for label in df['labels'].unique():
            dir_path = os.path.join(aug_dir, label)
            os.mkdir(dir_path)
        # create and store the augmented images
        total = 0
        gen = ImageDataGenerator(horizontal_flip=True, rotation_range=20, width_shift_range=.2,
                                 height_shift_range=.2, zoom_range=.2)
        groups = df.groupby('labels')  # group by class
        for label in df['labels'].unique():  # for every class
            group = groups.get_group(label)  # a dataframe holding only rows with the specified label
            sample_count = len(group)  # determine how many samples there are in this class
            if sample_count < n:  # if the class has less than the target number of images
                aug_img_count = 0
                delta = n - sample_count  # number of augmented images to create
                target_dir = os.path.join(aug_dir, label)  # define where to write the images
                msg = '{0:40s} for class {1:^30s} creating {2:^5s} augmented images'.format(' ', label, str(delta))
                print(msg, '\r', end='')  # prints over on the same line
                aug_gen = gen.flow_from_dataframe(group, x_col='filepaths', y_col=None, target_size=img_size,
                                                  class_mode=None, batch_size=1, shuffle=False,
                                                  save_to_dir=target_dir, save_prefix='aug-', color_mode='rgb',
                                                  save_format='jpg')
                while aug_img_count < delta:
                    images = next(aug_gen)
                    aug_img_count += len(images)
                total += aug_img_count
        print('Total Augmented images created= ', total)
        # create aug_df and merge it with df to create the composite training set
        aug_fpaths = []
        aug_labels = []
        classlist = os.listdir(aug_dir)
        for klass in classlist:
            classpath = os.path.join(aug_dir, klass)
            flist = os.listdir(classpath)
            for f in flist:
                fpath = os.path.join(classpath, f)
                aug_fpaths.append(fpath)
                aug_labels.append(klass)
        Fseries = pd.Series(aug_fpaths, name='filepaths')
        Lseries = pd.Series(aug_labels, name='labels')
        aug_df = pd.concat([Fseries, Lseries], axis=1)
        df = pd.concat([df, aug_df], axis=0).reset_index(drop=True)
        return df

    df = df.copy()
    # make directories to store augmented images
    aug_dir = os.path.join(working_dir, 'aug')
    if 'aug' in os.listdir(working_dir):
        print(' Augmented images already exist. To delete these and create new images enter D, else enter U to use these images', flush=True)
        ans = input(' ')
        if ans in ('D', 'd'):
            shutil.rmtree(aug_dir)  # start with a clean, empty directory
            return augment(df, n, working_dir, img_size)
        else:
            return df
    else:
        return augment(df, n, working_dir, img_size)

n = 120  # number of samples in each class
working_dir = r'./'  # directory to store augmented images
img_size = (166, 208)  # size of augmented images
train_df = balance(train_df, n, working_dir, img_size)
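Since your question starts from a directory rather than a dataframe, here is a minimal sketch, assuming the data is laid out as dataset/<class_name>/<image file>, of how a train_df with filepaths and labels columns could be built and how to check the size before and after running balance. The make_df helper is hypothetical and not part of the function above.

import os
import pandas as pd

# hypothetical helper: build a dataframe with 'filepaths' and 'labels' columns
# from a directory laid out as dataset/<class_name>/<image file>
def make_df(data_dir):
    filepaths, labels = [], []
    for label in sorted(os.listdir(data_dir)):
        class_dir = os.path.join(data_dir, label)
        if not os.path.isdir(class_dir):
            continue  # skip any stray files at the top level
        for fname in os.listdir(class_dir):
            filepaths.append(os.path.join(class_dir, fname))
            labels.append(label)
    return pd.DataFrame({'filepaths': filepaths, 'labels': labels})

train_df = make_df("/content/drive/MyDrive/dataset")
print(len(train_df))                                    # original size, e.g. 327
train_df = balance(train_df, n, working_dir, img_size)
print(len(train_df))                                    # new size after the augmented images are merged in

The composite dataframe can then be fed to a generator with ImageDataGenerator().flow_from_dataframe(train_df, x_col='filepaths', y_col='labels', target_size=img_size, class_mode='binary'), and the generator will report the new, larger image count.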
Upvotes: 1