abracadabra
abracadabra

Reputation: 381

Training a model with tensorflow,but loss just won't go down?

I am training a model that classifies 62 alphanumeric characters.But The loss value first drastically drops during first few batches and then hit the ground and never go down.I don't know where it is wrong or how to debug the model.

Here is a snapshot of training log:

enter image description here

Here is examples of traning data:

enter image description here

I use 4 conv layers followed by 1 fc layer,adam optimizer to minimize log loss.I double checked that the image label is right.So i don't know where else it is wrong.

Here is the code:

import numpy as np
import tensorflow as tf
import os
from PIL import Image
import shutil
import time

input = temp= tf.placeholder(dtype='float32', shape=(None,32,32,1), name='input')#(None,62)
label = tf.placeholder(dtype='float32',shape=(None,62))#(None,62)

temp = tf.layers.conv2d(inputs=temp,filters=32,kernel_size=(3,3),padding="SAME",activation=tf.nn.relu,kernel_initializer=tf.keras.initializers.he_normal())#(None,32,32,32)
#temp = tf.layers.dropout(inputs=temp,rate=0.5)
temp = tf.layers.max_pooling2d(temp,pool_size=[2, 2], strides=2)#(None,16,16,32)

temp = tf.layers.conv2d(inputs=temp,filters=64,kernel_size=(3,3),padding="SAME",activation=tf.nn.relu,kernel_initializer=tf.keras.initializers.he_normal())#(None,16,16,64)
#temp = tf.layers.dropout(inputs=temp,rate=0.2)
temp = tf.layers.max_pooling2d(temp,pool_size=[2, 2], strides=2)#(None,8,8,64)

temp = tf.layers.conv2d(inputs=temp,filters=128,kernel_size=(3,3),padding="SAME",activation=tf.nn.relu,kernel_initializer=tf.keras.initializers.he_normal())#(None,8,8,128)
temp = tf.layers.dropout(inputs=temp,rate=0.2)
temp = tf.layers.max_pooling2d(temp,pool_size=[2, 2], strides=2)#(None,4,4,128)

temp = tf.layers.conv2d(inputs=temp,filters=256,kernel_size=(3,3),padding="SAME",activation=tf.nn.relu,kernel_initializer=tf.keras.initializers.he_normal())#(None,4,4,256)
temp = tf.layers.dropout(inputs=temp,rate=0.2)
temp = tf.layers.max_pooling2d(temp,pool_size=[2, 2], strides=2)#(None,2,2,256)

temp = tf.layers.conv2d(inputs=temp,filters=62,kernel_size=(2,2),padding="VALID",kernel_initializer=tf.keras.initializers.he_normal())#fc (None,1,1,62)
output = temp = tf.layers.flatten(temp)#(None,62)
output = tf.nn.softmax(output)

#loss
output_clip = tf.clip_by_value(output,1e-7,1-1e-7)
loss = tf.reduce_mean(tf.reduce_sum(-label*tf.log(output_clip)-(1-label)*tf.log(1-output_clip),axis=-1))#scaler
optimizer = tf.train.AdamOptimizer().minimize(loss)

#accuracy
indexoutput = tf.argmax(output,axis=-1)#(None,)
labelindex = tf.argmax(label,axis=-1)#(None,)
equals = tf.equal(indexoutput,labelindex)#(None,)
equals = tf.reduce_sum(tf.cast(equals,dtype='int8'),axis=-1)#scaler
acc = tf.cast(equals,dtype='float32')/tf.cast(tf.shape(output)[0],dtype='float32')#scaler

def train(epochs):
    saver = tf.train.Saver()
    lossrec=[]
    accrec = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        #saver.restore(sess, os.path.join(os.getcwd(),'model_logloss','captchabreak.ckpt'))
        valimg,vallabel = next(validategenerator(os.path.join(os.getcwd(),'tests')))
        for i in range(epochs):
            for j, (trainimg, trainlabel) in enumerate(traingenerator(os.path.join(os.getcwd(),'pics'),32)):
                _, trainacc, trainloss = sess.run([optimizer, acc, loss],feed_dict={input: trainimg, label: trainlabel});
                valacc, valloss = sess.run([acc, loss], feed_dict={input: np.array(valimg), label: np.array(vallabel)})
                print("epoch:{} batch:{} trainloss:{:.4f} validateloss:{:.4f} trainacc:{:.2f} validateacc:{:.2f}"
                      .format(i, j, trainloss, valloss, trainacc, valacc))
            #keep some logs
            lossrec.append(valloss)
            accrec.append(valacc)
            print(lossrec);
            print(accrec)
            if len(lossrec) >= 3 and valloss >= lossrec[-2] and valloss >= lossrec[-3]: break;
            shutil.rmtree("model_logloss")
            saver.save(sess, "model_logloss/captchabreak.ckpt")

def traingenerator(path,batch_size):
    fs = os.listdir(path);
    fs=np.random.permutation(fs)
    if batch_size == 0:
        batch_size = len(fs)
    offset=0
    while offset<len(fs):
        yield fetch(fs[offset:offset+batch_size],path)
        offset+=batch_size

def validategenerator(path):
    fs = os.listdir(path);
    yield fetch(fs,path)

def fetch(fs,path):
    imgs = []
    labels = []
    for i, fname in enumerate(fs):
        fp = os.path.join(path, fname)
        imp = Image.open(fp).resize((32, 32));
        imp = imp.convert('L')
        imp = imp.point(lambda p: p > 210 and 255)
        im = np.array(imp)
        im = np.expand_dims(im, axis=-1)
        imp.close()

        c = fname[0]
        lb = np.zeros((62))
        if ord(c) >= 48 and ord(c) <= 57:
            lb[ord(c) - 48] = 1
        if ord(c) >= 65 and ord(c) <= 90:
            lb[ord(c) - 65 + 10] = 1
        if ord(c) >= 97 and ord(c) <= 122:
            lb[ord(c) - 97 + 36] = 1
        imgs.append(im)
        labels.append(lb)
    return np.array(imgs), np.array(labels)

if __name__ == "__main__":
    train(30)

Upvotes: 0

Views: 582

Answers (1)

abracadabra
abracadabra

Reputation: 381

It's solved . I forgot to divide the pixel value by 255.

Upvotes: 3

Related Questions