Reputation: 489
I am trying to extend a basic classification model (https://machinelearningmastery.com/handwritten-digit-recognition-using-convolutional-neural-networks-python-keras/) to a simple object detection model for single objects.
The classification model simply classifies handwritten digits in images where the digit fills most of the image. To make a meaningful dataset for the object detection I use the MNIST dataset as base and transform it into a new dataset by the following steps
Figure 1: Illustration of step 1 and 2.
Figure 2: Some produced ground truth bounding boxes.
The output vector from the model is inspired by the YOLO definition but for a single object:
y = [p, x, y, w, h, c0, ..., c9]
where p = probability of an object, (x, y, w, h) = bounding box centre, width and height as fraction of image size, c0-c9 = class probabilities (one for each digit).
So, to change the classification model to an object detection model I simply replaced the last softmax layer to a fully connected layer with 15 nodes (one for each value in y
) and wrote a custom loss function that could compare a prediction to the ground truth.
However, when I try to train the model I get the mysterious error tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [15] vs. [200]
where [15]
is the number of nodes in my final layer and [200]
is the batch size I specify for training (i verified this by changing the values and running again). They cannot reasonably have to be the same so I guess I have missed something vital when it comes to tensor dimensions in the model, but I cannot figure out what.
Note: My understanding of a batch is how many samples (images) the model processes at once during training. So it is reasonable that the batch size should be an even fraction of the training data size. But there is nothing that should connect it to the number of output nodes in the model.
Any help is appreciated.
Here is the code in its entirety:
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras import backend as K
def increase_image_size(im_set, new_size):
num_images = im_set.shape[0]
orig_size = im_set[0].shape[0]
im_stack = np.zeros((num_images, new_size, new_size), dtype='uint8')
# Put MNIST digits at random positions in new images
for i in range(num_images):
x0 = int(np.random.random() * (new_size - orig_size - 1))
y0 = int(np.random.random() * (new_size - orig_size - 1))
x1 = x0 + orig_size
y1 = y0 + orig_size
im_stack[i, y0:y1, x0:x1] = im_set[i]
return im_stack
# Get bounding box annotations from images and object labels
def get_image_annotations(X_train, y_train):
num_images = len(X_train)
annotations = np.zeros((num_images, 15), dtype='float')
for i in range(num_images):
annotations[i] = get_image_annotation(X_train[i], y_train[i])
return annotations
def get_image_annotation(X, y):
sz_y, sz_x = X.shape
y_indices, x_indices = np.where(X > 0)
y_min = max(np.min(y_indices) - 1, 0)
y_max = min(np.max(y_indices) + 1, sz_y)
x_min = max(np.min(x_indices) - 1, 0)
x_max = min(np.max(x_indices) + 1, sz_x)
bb_x = (x_min + x_max) / 2.0 / sz_x
bb_y = (y_min + y_max) / 2.0 / sz_y
bb_w = (x_max - x_min) / sz_x
bb_h = (y_max - y_min) / sz_y
classes = np.zeros(10, dtype='float')
classes[y] = 1
output = np.concatenate(([1, bb_x, bb_y, bb_w, bb_h], classes))
return output
def custom_cost_function(y_true, y_pred):
p_p = y_pred[0]
x_p = y_pred[1]
y_p = y_pred[2]
w_p = y_pred[3]
h_p = y_pred[4]
p_t = y_true[0]
x_t = y_true[1]
y_t = y_true[2]
w_t = y_true[3]
h_t = y_true[4]
c_pred = y_pred[5:]
c_true = y_true[5:]
c1 = K.sum((c_pred - c_true) * (c_pred - c_true))
c2 = (x_p - x_t) * (x_p - x_t) + (y_p - y_t) * (y_p - y_t) \
+ (K.sqrt(w_p) - K.sqrt(w_t)) * (K.sqrt(w_p) - K.sqrt(w_t)) \
+ (K.sqrt(h_p) - K.sqrt(h_t)) * (K.sqrt(h_p) - K.sqrt(h_t))
lambda_class = 1.0
lambda_coord = 1.0
return lambda_class * c1 + lambda_coord * c2
def baseline_model():
# create model
model = Sequential()
model.add(Conv2D(32, (5, 5), input_shape=(1, 100, 100), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(15, activation='linear'))
# Compile model
model.compile(loss=custom_cost_function, optimizer='adam', metrics=['accuracy'])
return model
def mnist_object_detection():
K.set_image_dim_ordering('th')
# fix random seed for reproducibility
np.random.seed(7)
# Load data
print("Loading data")
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Adjust input images
print("Adjust input images (increasing image sizes and moving digits)")
X_train = increase_image_size(X_train, 100)
X_test = increase_image_size(X_test, 100)
print("Creating annotations")
y_train_prim = get_image_annotations(X_train, y_train)
y_test_prim = get_image_annotations(X_test, y_test)
print("...done")
# reshape to be [samples][pixels][width][height]
X_train = X_train.reshape(X_train.shape[0], 1, 100, 100).astype('float32')
X_test = X_test.reshape(X_test.shape[0], 1, 100, 100).astype('float32')
# normalize inputs from 0-255 to 0-1
X_train = X_train / 255
X_test = X_test / 255
# build the model
print("Building model")
model = baseline_model()
# Fit the model
print("Training model")
model.fit(X_train, y_train_prim, validation_data=(X_test, y_test_prim), epochs=10, batch_size=200, verbose=1)
if __name__ == '__main__':
mnist_object_detection()
When I run it I get the error:
/Users/gedda/anaconda3/envs/keras-obj-det/bin/pythonn /Users/gedda/devel/tensorflow/digit-recognition/object_detection_reduced.py
Using TensorFlow backend.
Loading data
Adjust input images (increasing image sizes and moving digits)
Creating annotations
...done
Building model
2018-11-30 13:26:34.030159: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX
2018-11-30 13:26:34.030463: I tensorflow/core/common_runtime/process_util.cc:69] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.
Training model
Train on 60000 samples, validate on 10000 samples
Epoch 1/3
Traceback (most recent call last):
File "/Users/gedda/devel/tensorflow/digit-recognition/object_detection_reduced.py", line 140, in <module>
mnist_object_detection()
File "/Users/gedda/devel/tensorflow/digit-recognition/object_detection_reduced.py", line 136, in mnist_object_detection
model.fit(X_train, y_train_prim, validation_data=(X_test, y_test_prim), epochs=3, batch_size=200, verbose=1)
File "/Users/gedda/anaconda3/envs/keras-obj-det/lib/python3.6/site-packages/keras/engine/training.py", line 1039, in fit
validation_steps=validation_steps)
File "/Users/gedda/anaconda3/envs/keras-obj-det/lib/python3.6/site-packages/keras/engine/training_arrays.py", line 199, in fit_loop
outs = f(ins_batch)
File "/Users/gedda/anaconda3/envs/keras-obj-det/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2715, in __call__
return self._call(inputs)
File "/Users/gedda/anaconda3/envs/keras-obj-det/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2675, in _call
fetched = self._callable_fn(*array_vals)
File "/Users/gedda/anaconda3/envs/keras-obj-det/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1439, in __call__
run_metadata_ptr)
File "/Users/gedda/anaconda3/envs/keras-obj-det/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 528, in __exit__
c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [15] vs. [200]
[[{{node training/Adam/gradients/loss/dense_2_loss/mul_7_grad/BroadcastGradientArgs}} = BroadcastGradientArgs[T=DT_INT32, _class=["loc:@training/Adam/gradients/loss/dense_2_loss/mul_7_grad/Reshape"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](training/Adam/gradients/loss/dense_2_loss/mul_7_grad/Shape, training/Adam/gradients/loss/dense_2_loss/mul_7_grad/Shape_1)]]
Process finished with exit code 1
Upvotes: 1
Views: 1900
Reputation: 86600
The first dimension of all tensors is the batch size.
Your loss should probably be working in the second dimension:
def custom_cost_function(y_true, y_pred):
p_p = y_pred[:,0]
x_p = y_pred[:,1]
y_p = y_pred[:,2]
w_p = y_pred[:,3]
h_p = y_pred[:,4]
p_t = y_true[:,0]
x_t = y_true[:,1]
y_t = y_true[:,2]
w_t = y_true[:,3]
h_t = y_true[:,4]
c_pred = y_pred[:,5:]
c_true = y_true[:,5:]
........
Upvotes: 2