TensorFlow image binary classifier not working efficiently after training

I am experimenting with a binary classifier on images whether it is a bee or not. I have gathered a dataset of 12,000 images of 6 categories, one of which is bees. So I have a column is_bee with values of "0" and "1" matching to "Not a bee" and "It is a bee". I am training the classifier and when I try to apply the trained model on any image, even one of the trained ones, it gives me (almost) exclusively value "0" with 73.11% confidence. My code is as follows:

import pandas as pd
import numpy as np
import sys
import os
import random
from pathlib import Path
import imageio
import skimage
import skimage.transform
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras import optimizers, losses
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPool2D, Dropout, BatchNormalization,LeakyReLU
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import tensorflow

# Global variables

def read_img(file):
    img =
    img = skimage.transform.resize(img, (img_width, img_height), mode='reflect')
    return img[:,:,:img_channels]

images=pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/beeID/images.csv', 
                index_col=False, sep=";",
                dtype={'type':'category', 'is_bee':'category'})
images['file'] = imgs_folder+images['file']
# Cannot impute nans, drop them
# Some image files don't exist. Leave only bees with available images.
img_exists = images['file'].apply(lambda f: os.path.exists(f))
images = images[img_exists]
images['type'] = images['type'].astype('category')
images['is_bee'] = images['is_bee'].astype('category')

def split_balance(df, field_name):
    train_df, test_df = train_test_split(df, random_state=23)
    train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=23)
    ncat_bal = int(len(train_df)/train_df[field_name].cat.categories.size)
    train_df_bal = train_df.groupby(field_name, as_index=False).apply(lambda g: g.sample(ncat_bal, replace=True)).reset_index(drop=True)
    return(train_df_bal, val_df, test_df)

def prepare2train(train_p2t_df, val_p2t_df, test_p2t_df, field_name):
    # Train data
    train_X = np.stack(train_p2t_df['file'].apply(read_img))
    #train_y = to_categorical(train_bees[field_name].values)
    train_y  = pd.get_dummies(train_p2t_df[field_name], drop_first=False)
    # Validation during training data to calc val_loss metric
    val_X = np.stack(val_p2t_df['file'].apply(read_img))
    #val_y = to_categorical(val_bees[field_name].values)
    val_y = pd.get_dummies(val_p2t_df[field_name], drop_first=False)
    # Test data
    test_X = np.stack(test_p2t_df['file'].apply(read_img))
    #test_y = to_categorical(test_bees[field_name].values)
    test_y = pd.get_dummies(test_p2t_df[field_name], drop_first=False)
    # Data augmentation
    generator = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            featurewise_std_normalization=False,  # divide inputs by std of the dataset
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            rotation_range=180,  # randomly rotate images in the range (degrees, 0 to 180)
            zoom_range=0.1, # Randomly zoom image 
            width_shift_range=0.2,  # randomly shift images horizontally (fraction of total width)
            height_shift_range=0.2,  # randomly shift images vertically (fraction of total height)
            horizontal_flip=True,  # randomly flip images
    return (generator, train_X, val_X, test_X, train_y, val_y, test_y)
def eval_model(training, model, test_X, test_y, field_name):
    ## Trained model analysis and evaluation
    f, ax = plt.subplots(2,1, figsize=(5,5))
    ax[0].plot(training.history['loss'], label="Loss")
    ax[0].plot(training.history['val_loss'], label="Validation loss")
    ax[0].set_title('%s: loss' % field_name)
    # Accuracy
    ax[1].plot(training.history['accuracy'], label="Accuracy")
    ax[1].plot(training.history['val_accuracy'], label="Validation accuracy")
    ax[1].set_title('%s: accuracy' % field_name)
    # Accuracy by subspecies
    test_pred = model.predict(test_X)
    acc_by_subspecies = np.logical_and((test_pred > 0.5), test_y).sum()/test_y.sum()
    acc_by_subspecies.plot(kind='bar', title='Accuracy by %s' % field_name)
    # Print metrics
    print("Classification report")
    test_pred = np.argmax(test_pred, axis=1)
    test_truth = np.argmax(test_y.values, axis=1)
    # Loss function and accuracy
    test_res = model.evaluate(test_X, test_y.values, verbose=1)

# Split/balance and plot the result
train_bees_bal, val_bees, test_bees = split_balance(bees, 'health')
# Split/balance and plot the result
train_images_bal, val_images, test_images = split_balance(images, 'type')

# Will use balanced dataset as main
train_images = train_images_bal
generator_images, train_images_X, val_images_X, test_images_X, train_images_y, val_images_y, test_images_y = prepare2train(train_images, val_images, test_images, 'is_bee')

# We'll stop training if no improvement after some epochs
earlystopper_images = EarlyStopping(monitor='val_accuracy', patience=20, verbose=1)

# Save the best model during the training
checkpointer_images = ModelCheckpoint('model_images.h5'
# Build CNN model
model_images.add(Conv2D(5, kernel_size=3, input_shape=(img_width, img_height,3), activation='relu', padding='same'))
model_images.add(Conv2D(10, kernel_size=3, activation='relu', padding='same'))
model_images.add(Dense(train_images_y.columns.size, activation='sigmoid', name='preds'))
# show model summary

# Train
training_images = model_images.fit_generator(generator_images.flow(train_images_X,train_images_y, batch_size=60)
                        ,validation_data=(val_images_X, val_images_y)
                        ,callbacks=[earlystopper_images, checkpointer_images])
# Get the best saved weights

eval_model(training_images, model_images, test_images_X, test_images_y, 'is_bee')
scores = model_images.evaluate(test_images_X, test_images_y, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

import numpy as np
from google.colab import files
from keras.preprocessing import image
for fn in os.listdir('/content/bee_imgs'):
  # predicting images
  path = '/content/bee_imgs/' + fn
  img = image.load_img(path, target_size=(100, 100))
  x = image.img_to_array(img)
  x = np.expand_dims(x, axis=0)
  checkImages = np.vstack([x])
  classes_images = model_images.predict(checkImages)
  score = tensorflow.nn.softmax(classes_images[0])
    "This image {} most likely belongs to {} with a {:.2f} percent confidence."
    .format(fn, test_images_y.columns[np.argmax(score)], 100 * np.max(score))

The result of the above testing on already trained images (but also occurs when I try it with non trained images) is this:

This image 016_252.png most likely belongs to 0 with a 73.11 percent confidence.
This image 031_117.png most likely belongs to 0 with a 73.11 percent confidence.
This image 019_1026.png most likely belongs to 0 with a 73.11 percent confidence.
This image 008_243.png most likely belongs to 1 with a 73.11 percent confidence.
This image 039_016.png most likely belongs to 0 with a 73.11 percent confidence.
This image 022_364.png most likely belongs to 0 with a 73.11 percent confidence.
This image 022_274.png most likely belongs to 0 with a 73.11 percent confidence.

So, despite the fact that the model's accuracy has reached 96% in predicting whether an image is a bee or not, when I apply it on any image of a bee it seldom recognizes it as a bee. Is there something wrong with my model building or when I am trying to apply it?

After removing the softmax function and np.argmax, you should just use the same read_img function that was used during training for predictions and it should be fine.

