Oscar

Reputation: 1

Testing accuracy higher than training accuracy

Why is my testing accuracy higher than my training accuracy? This does not happen with the validation accuracy. Is it because of the way I am splitting my dataset?

Modifying the network did not help, so I am guessing I am doing something wrong in the dataset preparation part.

The dataset is composed of packet captures of malware and normal activity. The dataset.txt file contains 777 rows and 28 columns.

#imports
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from tensorflow.keras import models, layers

#converting dataset and labels to numpy arrays
x = np.genfromtxt("dataset.txt", delimiter=",")
y = np.genfromtxt("label.txt", delimiter=",")

#handling missing values
x[np.isnan(x)] = 0

#shuffling the data (note: np.random.shuffle is unseeded, so the split below
#differs on every run even though train_test_split has random_state=0)
indices = np.arange(x.shape[0])
np.random.shuffle(indices)
x = x[indices]
y = y[indices]

#dividing the dataset into train and test 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
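
#(optional sanity check, not in the original run) compare the class balance of
#the two splits; with 0/1 labels, the mean is the fraction of positive samples.
#A skewed split would make train and test accuracies hard to compare.
print('train class balance:', y_train.mean())
print('test class balance:', y_test.mean())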

#building the model
def build_model():
    model = models.Sequential()
    model.add(layers.Dense(32, activation='relu', input_shape=(28,)))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

'''cross validation
k = 5
num_val_samples = len(x_train) // k
all_losses = []
all_scores = []

for i in range(k):
    print('fold #', i)
    #fold i is held out for validation, the remaining folds are for training
    x_val = x_train[i * num_val_samples: (i + 1) * num_val_samples]
    y_val = y_train[i * num_val_samples: (i + 1) * num_val_samples]
    partial_x_train = np.concatenate([x_train[:i * num_val_samples],
                                      x_train[(i + 1) * num_val_samples:]], axis=0)
    partial_y_train = np.concatenate([y_train[:i * num_val_samples],
                                      y_train[(i + 1) * num_val_samples:]], axis=0)
    model = build_model()
    model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=16,
              verbose=0)
    val_loss, val_acc = model.evaluate(x_val, y_val, verbose=0)
    all_losses.append(val_loss)
    all_scores.append(val_acc)

print(all_scores)
#average over the folds instead of printing only the last fold's loss
print(np.mean(all_losses), np.mean(all_scores))
'''

#training the model with the entire training dataset
model = build_model()
model.fit(x_train, y_train, epochs=20, batch_size=16)

#confusion matrix
y_pred = model.predict(x_test)
y_pred = (y_pred > 0.5)    #threshold sigmoid outputs at 0.5 to get class labels
result = confusion_matrix(y_test, y_pred)
print ('Confusion Matrix:')
print(result)

#calculating the test accuracy
model_acc = accuracy_score(y_test, y_pred)
print('Test Accuracy:')
print(model_acc)

Upvotes: 0

Views: 154

Answers (1)

mari.mts

Reputation: 723

This is because Keras reports a running average of the accuracy for each epoch: the number shown is averaged over all batches of the epoch, including the early ones, when the weights were still worse. For a small number of epochs this means that by the end of an epoch your model is better than it was on average during that epoch, so the printed training accuracy can fall below the test accuracy measured with the final weights.
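
You can check this yourself by re-evaluating on the training data after fit returns. A minimal sketch reusing build_model, x_train, and x_test from the question (the history key is 'accuracy' in tf.keras; older standalone Keras used 'acc'):

model = build_model()
history = model.fit(x_train, y_train, epochs=20, batch_size=16, verbose=0)

#running average over the final epoch's batches, i.e. what fit() prints
print('fit accuracy (running average):', history.history['accuracy'][-1])

#evaluate() uses the final weights only, so these two numbers are comparable
_, train_acc = model.evaluate(x_train, y_train, verbose=0)
_, test_acc = model.evaluate(x_test, y_test, verbose=0)
print('train accuracy (final weights):', train_acc)
print('test accuracy (final weights):', test_acc)

Note also that the Dropout layer in your model is active during fit but disabled during evaluate and predict, which pushes the reported training accuracy down in the same direction.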

This could also be due to randomly getting 'easier' samples in the test set, but that would not happen on every run if you split randomly in the same portion of the code.
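
To rule out an unlucky split, you can make it both reproducible and stratified. A sketch using scikit-learn's train_test_split, where stratify=y keeps the malware/normal ratio identical in both sets:

from sklearn.model_selection import train_test_split

#stratify=y preserves the class ratio in both sets; with no unseeded shuffle
#happening beforehand, random_state=0 also makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y)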

Upvotes: 3
