Reputation: 10033
I am failing at training a VGG NET model from scratch. Please allow me to describe my steps so far:

1. From two folders containing my training and validation images, listed in training_list.txt and validation_list.txt, I generated the data.mdb files. On inspection these mdb files show valid image data and correct labels (a quick way to spot-check the LMDB is sketched right after these steps).
2. Generated a mean_training_image.binaryproto file.
3. Uploaded the LMDB dataset to the FloydHub cloud, to train it using a GPU, with
floyd run --gpu --env caffe:py2 --data patalanov/datasets/vgg-my-face:input 'caffe train -solver models/Custom_Model/solver.prototxt'
4. Downloaded the file _iter_3000.caffemodel.
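For reference, a minimal sketch of such an LMDB spot-check using pycaffe plus the lmdb package (the path below is a placeholder, not necessarily the exact code used):

import lmdb
import caffe
from caffe.proto import caffe_pb2

# Placeholder path -- point this at the actual training_set_lmdb directory.
env = lmdb.open('training_set_lmdb', readonly=True)
with env.begin() as txn:
    key, value = next(txn.cursor().iternext())
    datum = caffe_pb2.Datum()
    datum.ParseFromString(value)
    img = caffe.io.datum_to_array(datum)  # array of shape (channels, height, width)
    print(key, img.shape, 'label =', datum.label)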
This is what my net prints:
blobs ['data', 'conv1', 'norm1', 'pool1', 'conv2', 'pool2', 'conv3', 'conv4', 'conv5', 'pool5', 'fc6', 'fc7', 'fc8', 'prob']
params ['conv1', 'conv2', 'conv3', 'conv4', 'conv5', 'fc6', 'fc7', 'fc8_cat']
Layer Name : conv1, Weight Dims :(96, 3, 7, 7)
Layer Name : conv2, Weight Dims :(256, 96, 5, 5)
Layer Name : conv3, Weight Dims :(512, 256, 3, 3)
Layer Name : conv4, Weight Dims :(512, 512, 3, 3)
Layer Name : conv5, Weight Dims :(512, 512, 3, 3)
Layer Name : fc6, Weight Dims :(4048, 25088)
Layer Name : fc7, Weight Dims :(4048, 4048)
Layer Name : fc8_cat, Weight Dims :(6, 4048)
fc6 weights are (4048, 25088) dimensional and biases are (4048,) dimensional
fc7 weights are (4048, 4048) dimensional and biases are (4048,) dimensional
fc8_cat weights are (6, 4048) dimensional and biases are (6,) dimensional
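(For context, the listing above can be reproduced with a short pycaffe snippet along these lines; the file names are placeholders for the actual deploy prototxt and the downloaded snapshot.)

import caffe

caffe.set_mode_cpu()
# Placeholder paths -- substitute the actual deploy.prototxt and snapshot.
net = caffe.Net('models/Custom_Model/deploy.prototxt',
                '_iter_3000.caffemodel', caffe.TEST)

print('blobs', list(net.blobs.keys()))
print('params', list(net.params.keys()))
for name, params in net.params.items():
    print('Layer Name : {}, Weight Dims : {}'.format(name, params[0].data.shape))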
Something is wrong, however, because the loss is not decreasing during training. Log:
Iteration 2980 (6.05491 iter/s, 1.65155s/10 iters), loss = 1.79258
Iteration 2980, lr = 0.001
Iteration 2990 (6.28537 iter/s, 1.591s/10 iters), loss = 1.79471
Iteration 2990, lr = 0.001
Snapshotting to binary proto file /output/_iter_3000.caffemodel
Snapshotting solver state to binary proto file /output/_iter_3000.solverstate
Iteration 3000, loss = 1.7902
Iteration 3000, Testing net (#0)
Ignoring source layer training_train
Optimization Done.
Optimization Done.
It seems that the initialized weights are not being propagated through the network, because from conv1 onwards the weights are all zero (a quick check is sketched after the dump):
...,
[[[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]],
[[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]],
[[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]],
...,
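A quick way to confirm which parameter blobs are all zero (assuming net is the caffe.Net loaded from the snapshot as above):

import numpy as np

# Assumes `net` is the caffe.Net loaded from _iter_3000.caffemodel above.
for name, params in net.params.items():
    w = params[0].data
    print(name, 'nonzero weights:', np.count_nonzero(w), '/', w.size)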
train.prototxt
name: "CaffeNet"
layers {
name: "training_train"
type: DATA
data_param {
source: "/input/training_set_lmdb"
backend: LMDB
batch_size: 30
}
transform_param{
mean_file: "/input/mean_training_image.binaryproto"
}
top: "data"
top: "label"
include {
phase: TRAIN
}
}
layers {
name: "training_test"
type: DATA
data_param {
source: "/input/validation_set_lmdb"
backend: LMDB
batch_size: 15
}
transform_param{
mean_file: "/input/mean_training_image.binaryproto"
}
top: "data"
top: "label"
include {
phase: TEST
}
}
layers {
name: "conv1"
type: CONVOLUTION
bottom: "data"
top: "conv1"
convolution_param {
num_output: 96
kernel_size: 7
stride: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
blobs_lr: 0
blobs_lr: 0
}
layers {
name: "relu1"
type: RELU
bottom: "conv1"
top: "conv1"
}
layers {
name: "norm1"
type: LRN
bottom: "conv1"
top: "norm1"
lrn_param {
local_size: 5
alpha: 0.0005
beta: 0.75
}
}
layers {
name: "pool1"
type: POOLING
bottom: "norm1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 3
}
}
layers {
name: "conv2"
type: CONVOLUTION
bottom: "pool1"
top: "conv2"
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
}
blobs_lr: 0
blobs_lr: 0
}
layers {
name: "relu2"
type: RELU
bottom: "conv2"
top: "conv2"
}
layers {
name: "pool2"
type: POOLING
bottom: "conv2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layers {
name: "conv3"
type: CONVOLUTION
bottom: "pool2"
top: "conv3"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
blobs_lr: 0
blobs_lr: 0
}
layers {
name: "relu3"
type: RELU
bottom: "conv3"
top: "conv3"
}
layers {
name: "conv4"
type: CONVOLUTION
bottom: "conv3"
top: "conv4"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
blobs_lr: 0
blobs_lr: 0
}
layers {
name: "relu4"
type: RELU
bottom: "conv4"
top: "conv4"
}
layers {
name: "conv5"
type: CONVOLUTION
bottom: "conv4"
top: "conv5"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
blobs_lr: 0
blobs_lr: 0
}
layers {
name: "relu5"
type: RELU
bottom: "conv5"
top: "conv5"
}
layers {
name: "pool5"
type: POOLING
bottom: "conv5"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 3
stride: 3
}
}
layers {
name: "fc6"
type: INNER_PRODUCT
bottom: "pool5"
top: "fc6"
inner_product_param {
num_output: 4048
}
blobs_lr: 1.0
blobs_lr: 1.0
}
layers {
name: "relu6"
type: RELU
bottom: "fc6"
top: "fc6"
}
layers {
name: "drop6"
type: DROPOUT
bottom: "fc6"
top: "fc6"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc7"
type: INNER_PRODUCT
bottom: "fc6"
top: "fc7"
inner_product_param {
num_output: 4048
}
blobs_lr: 1.0
blobs_lr: 1.0
}
layers {
name: "relu7"
type: RELU
bottom: "fc7"
top: "fc7"
}
layers {
name: "drop7"
type: DROPOUT
bottom: "fc7"
top: "fc7"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc8_cat"
type: INNER_PRODUCT
bottom: "fc7"
top: "fc8"
inner_product_param {
num_output: 6
}
blobs_lr: 1.0
blobs_lr: 1.0
}
layers {
name: "prob"
type: SOFTMAX_LOSS
bottom: "fc8"
bottom: "label"
}
deploy.prototxt
name: "VGG_FACE_16_layers"
input: "data"
input_dim: 1
input_dim: 3
input_dim: 224
input_dim: 224
layers {
name: "conv1"
type: CONVOLUTION
bottom: "data"
top: "conv1"
convolution_param {
num_output: 96
kernel_size: 7
stride: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu1"
type: RELU
bottom: "conv1"
top: "conv1"
}
layers {
name: "norm1"
type: LRN
bottom: "conv1"
top: "norm1"
lrn_param {
local_size: 5
alpha: 0.0005
beta: 0.75
}
}
layers {
name: "pool1"
type: POOLING
bottom: "norm1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 3
}
}
layers {
name: "conv2"
type: CONVOLUTION
bottom: "pool1"
top: "conv2"
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
}
}
layers {
name: "relu2"
type: RELU
bottom: "conv2"
top: "conv2"
}
layers {
name: "pool2"
type: POOLING
bottom: "conv2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
layers {
name: "conv3"
type: CONVOLUTION
bottom: "pool2"
top: "conv3"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layers {
name: "relu3"
type: RELU
bottom: "conv3"
top: "conv3"
}
layers {
name: "conv4"
type: CONVOLUTION
bottom: "conv3"
top: "conv4"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layers {
name: "relu4"
type: RELU
bottom: "conv4"
top: "conv4"
}
layers {
name: "conv5"
type: CONVOLUTION
bottom: "conv4"
top: "conv5"
convolution_param {
num_output: 512
pad: 1
kernel_size: 3
}
}
layers {
name: "relu5"
type: RELU
bottom: "conv5"
top: "conv5"
}
layers {
name: "pool5"
type: POOLING
bottom: "conv5"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 3
stride: 3
}
}
layers {
name: "fc6"
type: INNER_PRODUCT
bottom: "pool5"
top: "fc6"
inner_product_param {
num_output: 4048
}
}
layers {
name: "relu6"
type: RELU
bottom: "fc6"
top: "fc6"
}
layers {
name: "drop6"
type: DROPOUT
bottom: "fc6"
top: "fc6"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc7"
type: INNER_PRODUCT
bottom: "fc6"
top: "fc7"
inner_product_param {
num_output: 4048
}
}
layers {
name: "relu7"
type: RELU
bottom: "fc7"
top: "fc7"
}
layers {
name: "drop7"
type: DROPOUT
bottom: "fc7"
top: "fc7"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc8_cat"
type: INNER_PRODUCT
bottom: "fc7"
top: "fc8"
inner_product_param {
num_output: 6
}
}
layers {
name: "prob"
type: SOFTMAX
bottom: "fc8"
top: "prob"
}
solver.prototxt
net: "models/Custom_Model/train.prototxt"
# test_iter specifies how many forward passes the test should carry out
test_iter: 1
# Carry out testing every X training iterations
test_interval: 20
# Learning rate and momentum parameters for Adam
base_lr: 0.001
momentum: 0.9
momentum2: 0.999
# Adam takes care of changing the learning rate
lr_policy: "fixed"
# Display every X iterations
display: 10
# The maximum number of iterations
max_iter: 3000
# snapshot intermediate results
snapshot: 1000
snapshot_prefix: "/output/"
# solver mode: CPU or GPU
type: "Adam"
solver_mode: GPU
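As an aside, the same solver can be driven from pycaffe for a few iterations to watch the parameters directly; this is only a local debugging sketch and assumes the LMDB/mean paths in the prototxts resolve on the local machine:

import caffe

caffe.set_mode_gpu()  # or caffe.set_mode_cpu() when no GPU is available
# Picks up type: "Adam" from the solver definition above.
solver = caffe.get_solver('models/Custom_Model/solver.prototxt')

for i in range(10):
    solver.step(1)
    w = solver.net.params['conv2'][0].data
    print('iter', i, 'conv2 mean |w| =', abs(w).mean())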
What am I missing here?
EDIT: log of the first 10 iterations:
2018-06-24 13:45:30 PSTI0624 20:45:30.842751 22 solver.cpp:218] Iteration 0 (0 iter/s, 0.345767s/10 iters), loss = 1.79176
2018-06-24 13:45:30 PSTI0624 20:45:30.842778 22 sgd_solver.cpp:105] Iteration 0, lr = 0.001
2018-06-24 13:45:32 PSTI0624 20:45:32.362357 22 net.cpp:591] [Forward] Layer training_train, top blob data data: 32.8544
2018-06-24 13:45:32 PSTI0624 20:45:32.362499 22 net.cpp:591] [Forward] Layer training_train, top blob label data: 2.46875
2018-06-24 13:45:32 PSTI0624 20:45:32.373751 22 net.cpp:591] [Forward] Layer conv1, top blob conv1 data: 3.80577
2018-06-24 13:45:32 PSTI0624 20:45:32.373879 22 net.cpp:603] [Forward] Layer conv1, param blob 0 data: 0.00792726
2018-06-24 13:45:32 PSTI0624 20:45:32.375567 22 net.cpp:603] [Forward] Layer conv1, param blob 1 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.379498 22 net.cpp:591] [Forward] Layer relu1, top blob conv1 data: 1.88893
2018-06-24 13:45:32 PSTI0624 20:45:32.382942 22 net.cpp:591] [Forward] Layer norm1, top blob norm1 data: 1.86441
2018-06-24 13:45:32 PSTI0624 20:45:32.384709 22 net.cpp:591] [Forward] Layer pool1, top blob pool1 data: 2.64384
2018-06-24 13:45:32 PSTI0624 20:45:32.407202 22 net.cpp:591] [Forward] Layer conv2, top blob conv2 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.407317 22 net.cpp:603] [Forward] Layer conv2, param blob 0 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.407389 22 net.cpp:603] [Forward] Layer conv2, param blob 1 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.408679 22 net.cpp:591] [Forward] Layer relu2, top blob conv2 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.409492 22 net.cpp:591] [Forward] Layer pool2, top blob pool2 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.422981 22 net.cpp:591] [Forward] Layer conv3, top blob conv3 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.423092 22 net.cpp:603] [Forward] Layer conv3, param blob 0 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.423151 22 net.cpp:603] [Forward] Layer conv3, param blob 1 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.423795 22 net.cpp:591] [Forward] Layer relu3, top blob conv3 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.451364 22 net.cpp:591] [Forward] Layer conv4, top blob conv4 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.451510 22 net.cpp:603] [Forward] Layer conv4, param blob 0 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.451575 22 net.cpp:603] [Forward] Layer conv4, param blob 1 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.452227 22 net.cpp:591] [Forward] Layer relu4, top blob conv4 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.479389 22 net.cpp:591] [Forward] Layer conv5, top blob conv5 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.479573 22 net.cpp:603] [Forward] Layer conv5, param blob 0 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.479640 22 net.cpp:603] [Forward] Layer conv5, param blob 1 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.480298 22 net.cpp:591] [Forward] Layer relu5, top blob conv5 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.480830 22 net.cpp:591] [Forward] Layer pool5, top blob pool5 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.487016 22 net.cpp:591] [Forward] Layer fc6, top blob fc6 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.490016 22 net.cpp:603] [Forward] Layer fc6, param blob 0 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.490097 22 net.cpp:603] [Forward] Layer fc6, param blob 1 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.490191 22 net.cpp:591] [Forward] Layer relu6, top blob fc6 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.490295 22 net.cpp:591] [Forward] Layer drop6, top blob fc6 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.491454 22 net.cpp:591] [Forward] Layer fc7, top blob fc7 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.492004 22 net.cpp:603] [Forward] Layer fc7, param blob 0 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.492074 22 net.cpp:603] [Forward] Layer fc7, param blob 1 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.492199 22 net.cpp:591] [Forward] Layer relu7, top blob fc7 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.492300 22 net.cpp:591] [Forward] Layer drop7, top blob fc7 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.492488 22 net.cpp:591] [Forward] Layer fc8_cat, top blob fc8 data: 0.00944262
2018-06-24 13:45:32 PSTI0624 20:45:32.492555 22 net.cpp:603] [Forward] Layer fc8_cat, param blob 0 data: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.492619 22 net.cpp:603] [Forward] Layer fc8_cat, param blob 1 data: 0.00944262
2018-06-24 13:45:32 PSTI0624 20:45:32.492844 22 net.cpp:591] [Forward] Layer prob, top blob (automatic) data: 1.79202
2018-06-24 13:45:32 PSTI0624 20:45:32.492954 22 net.cpp:619] [Backward] Layer prob, bottom blob fc8 diff: 0.00868093
2018-06-24 13:45:32 PSTI0624 20:45:32.493074 22 net.cpp:619] [Backward] Layer fc8_cat, bottom blob fc7 diff: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.493140 22 net.cpp:630] [Backward] Layer fc8_cat, param blob 0 diff: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.493204 22 net.cpp:630] [Backward] Layer fc8_cat, param blob 1 diff: 0.0208672
2018-06-24 13:45:32 PSTI0624 20:45:32.493306 22 net.cpp:619] [Backward] Layer drop7, bottom blob fc7 diff: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.493403 22 net.cpp:619] [Backward] Layer relu7, bottom blob fc7 diff: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.496160 22 net.cpp:619] [Backward] Layer fc7, bottom blob fc6 diff: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.496706 22 net.cpp:630] [Backward] Layer fc7, param blob 0 diff: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.496806 22 net.cpp:630] [Backward] Layer fc7, param blob 1 diff: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.496896 22 net.cpp:619] [Backward] Layer drop6, bottom blob fc6 diff: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.496992 22 net.cpp:619] [Backward] Layer relu6, bottom blob fc6 diff: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.509070 22 net.cpp:630] [Backward] Layer fc6, param blob 0 diff: 0
2018-06-24 13:45:32 PSTI0624 20:45:32.509187 22 net.cpp:630] [Backward] Layer fc6, param blob 1 diff: 0
2018-06-24 13:45:32 PSTE0624 20:45:32.526118 22 net.cpp:719] [Backward] All net params (data, diff): L1 norm = (111.926, 0.125203); L2 norm = (1.18177, 0.0578014)
Upvotes: 1
Views: 154
Reputation: 114786
Looking at the debug log you posted, you can clearly see what went wrong:
...] [Forward] Layer conv1, top blob conv1 data: 3.80577      # looks good - non-zero signal
...] [Forward] Layer conv1, param blob 0 data: 0.00792726     # non-zero kernels
...] [Forward] Layer conv1, param blob 1 data: 0              # bias (not so important)
.
.
.
...] [Forward] Layer conv2, top blob conv2 data: 0            # no output signal !!!
...] [Forward] Layer conv2, param blob 0 data: 0              # kernels are all zero !!!
...] [Forward] Layer conv2, param blob 1 data: 0
The kernels (weights) of conv2 are all zero, so this layer produces all-zero output blobs, and everything from there on is zero - you cannot learn this way.
Let's take a closer look at the way conv1 (the good layer) and conv2 (the bad layer) are defined in your prototxt:
layers { name: "conv1" type: CONVOLUTION bottom: "data" top: "conv1" convolution_param { num_output: 96 kernel_size: 7 stride: 2 weight_filler { type: "gaussian" std: 0.01 } bias_filler { type: "constant" value: 0 } } } . . . layers { name: "conv2" type: CONVOLUTION bottom: "pool1" top: "conv2" convolution_param { num_output: 256 pad: 2 kernel_size: 5 } }
Can you see the difference?
While conv1 has a weight_filler defined (of type "gaussian"), conv2 has no weight_filler at all! By default, Caffe initializes the kernels/weights of conv2 to zero, and everything went south from that point on...
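In other words, every layer that is trained from scratch needs a weight_filler (and usually a bias_filler). As a sketch only (the gaussian/constant values below are the same common choices your conv1 already uses, not something dictated by your data), conv2 would become:

layers {
name: "conv2"
type: CONVOLUTION
bottom: "pool1"
top: "conv2"
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}

The same applies to conv3, conv4, conv5, fc6, fc7 and fc8_cat; "xavier" is another filler type Caffe supports that is commonly used when training from scratch.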
Upvotes: 2