Reputation: 77
After I applied data augmentation, like this:
train = ImageDataGenerator(
    rotation_range=90,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zca_whitening=True,
    shear_range=0.2)
test = ImageDataGenerator(rescale=1./255)
train_dataset = train.flow_from_directory("/content/drive/MyDrive/dataset",
                                          target_size=(150, 150),
                                          batch_size=32,
                                          class_mode='binary')
Output:
Found 327 images belonging to 2 classes.
Since 327 images is the size of my original dataset, has my training dataset actually been increased by the data augmentation? If it has, how do I get the new dataset size?
Upvotes: 1
Views: 2877
Reputation: 8092
Yes, you can increase the size by creating and saving augmented images for each class and then merging those images with the original training set. Below is a function I use for that, called balance. You pass it a dataframe that has the columns filepaths and labels. The filepaths column contains the full file path to each training image, and labels is the corresponding class label. n is the target number of images per class, working_dir is the directory where the augmented images are stored, and img_size is the size of the augmented images. A sketch of building such a dataframe from a directory like yours is shown after the code.
import os
import shutil
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def balance(df, n, working_dir, img_size):
    def augment(df, n, working_dir, img_size):
        aug_dir = os.path.join(working_dir, 'aug')
        os.mkdir(aug_dir)
        for label in df['labels'].unique():
            dir_path = os.path.join(aug_dir, label)
            os.mkdir(dir_path)
        # create and store the augmented images
        total = 0
        gen = ImageDataGenerator(horizontal_flip=True, rotation_range=20, width_shift_range=.2,
                                 height_shift_range=.2, zoom_range=.2)
        groups = df.groupby('labels')  # group by class
        for label in df['labels'].unique():  # for every class
            group = groups.get_group(label)  # a dataframe holding only rows with the specified label
            sample_count = len(group)  # determine how many samples there are in this class
            if sample_count < n:  # if the class has less than the target number of images
                aug_img_count = 0
                delta = n - sample_count  # number of augmented images to create
                target_dir = os.path.join(aug_dir, label)  # define where to write the images
                msg = '{0:40s} for class {1:^30s} creating {2:^5s} augmented images'.format(' ', label, str(delta))
                print(msg, '\r', end='')  # prints over on the same line
                aug_gen = gen.flow_from_dataframe(group, x_col='filepaths', y_col=None, target_size=img_size,
                                                  class_mode=None, batch_size=1, shuffle=False,
                                                  save_to_dir=target_dir, save_prefix='aug-', color_mode='rgb',
                                                  save_format='jpg')
                while aug_img_count < delta:
                    images = next(aug_gen)
                    aug_img_count += len(images)
                total += aug_img_count
        print('Total Augmented images created= ', total)
        # create aug_df and merge it with df to create the composite training set
        aug_fpaths = []
        aug_labels = []
        classlist = os.listdir(aug_dir)
        for klass in classlist:
            classpath = os.path.join(aug_dir, klass)
            flist = os.listdir(classpath)
            for f in flist:
                fpath = os.path.join(classpath, f)
                aug_fpaths.append(fpath)
                aug_labels.append(klass)
        Fseries = pd.Series(aug_fpaths, name='filepaths')
        Lseries = pd.Series(aug_labels, name='labels')
        aug_df = pd.concat([Fseries, Lseries], axis=1)
        df = pd.concat([df, aug_df], axis=0).reset_index(drop=True)
        return df

    df = df.copy()
    # make directories to store augmented images
    aug_dir = os.path.join(working_dir, 'aug')
    if 'aug' in os.listdir(working_dir):
        print(' Augmented images already exist. To delete these and create new images enter D, else enter U to use these images', flush=True)
        ans = input(' ')
        if ans in ('D', 'd'):
            shutil.rmtree(aug_dir)  # start with a clean, empty directory
            return augment(df, n, working_dir, img_size)
        else:
            return df
    else:
        return augment(df, n, working_dir, img_size)

n = 120  # number of samples in each class
working_dir = r'./'  # directory to store augmented images
img_size = (166, 208)  # size of augmented images
train_df = balance(train_df, n, working_dir, img_size)
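Since your question starts from a directory rather than a dataframe, here is a minimal sketch, assuming the data is laid out as dataset/<class_name>/<image file>, of how a train_df with filepaths and labels columns could be built and how to check the size before and after running balance. The make_df helper is hypothetical and not part of the function above.

import os
import pandas as pd

# hypothetical helper: build a dataframe with 'filepaths' and 'labels' columns
# from a directory laid out as dataset/<class_name>/<image file>
def make_df(data_dir):
    filepaths, labels = [], []
    for label in sorted(os.listdir(data_dir)):
        class_dir = os.path.join(data_dir, label)
        if not os.path.isdir(class_dir):
            continue  # skip any stray files at the top level
        for fname in os.listdir(class_dir):
            filepaths.append(os.path.join(class_dir, fname))
            labels.append(label)
    return pd.DataFrame({'filepaths': filepaths, 'labels': labels})

train_df = make_df("/content/drive/MyDrive/dataset")
print(len(train_df))                                    # original size, e.g. 327
train_df = balance(train_df, n, working_dir, img_size)
print(len(train_df))                                    # new size after the augmented images are merged in

The composite dataframe can then be fed to a generator with ImageDataGenerator().flow_from_dataframe(train_df, x_col='filepaths', y_col='labels', target_size=img_size, class_mode='binary'), and the generator will report the new, larger image count.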
Upvotes: 1