Why didn't the lasso here give me any zero coefficients?

Question

I got the idea of implementing my own version of deep feature selection from this paper: http://link.springer.com/chapter/10.1007%2F978-3-319-16706-0_20

The basic idea of deep feature selection, according to this paper, is to add a one-to-one mapping layer before the first fully connected hidden layer, and then add a regularization term (either lasso or elastic net) that drives some of the input-layer weights to zero.

My question is this: even though I seem to have implemented the deep feature selection framework correctly, testing it on random data generated by numpy.random.rand(1000, 50) does not give me any zeros in the input-layer weights. Is this a common thing for lasso-like regularization? Should I adjust the parameters I used for this framework (for example, use even more epochs)? Or did I do something wrong in my code?

class DeepFeatureSelectionMLP:
    """Multi-layer perceptron with a one-to-one "feature selection" input layer.

    The graph built in ``__init__`` is: placeholder X -> ``One2OneInputLayer``
    (one scalar weight per feature) -> zero or more ``DenseLayer`` hidden
    layers -> ``SoftmaxLayer``.  The training cost is the softmax layer's
    cost plus an elastic-net penalty on the input-layer weights and a second
    elastic-net penalty on the hidden/softmax weights.

    NOTE: the cost is minimized with Adam.  Plain gradient-based optimizers
    such as Adam generally shrink L1-penalized weights towards zero without
    making them *exactly* zero; proximal optimizers (FTRL, proximal Adagrad)
    are the usual choice when exact zeros are required.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        Y: Class label for each row of ``X``.  Labels may be any values
            ``np.unique`` can sort; they are mapped to columns ``0..K-1`` in
            sorted order (identical to the raw label when labels already are
            ``0..K-1``).
        hidden_dims: Sizes of the hidden layers; defaults to ``[100]``.  May
            be empty, in which case the input layer feeds the softmax layer
            directly.
        epochs: Number of mini-batch optimization steps run by ``train``.
        lambda1: Overall strength of the penalty on the input-layer weights.
        lambda2: L1 share of the input-layer penalty (``1 - lambda2`` goes to
            the L2 term).
        alpha1: Overall strength of the penalty on hidden and softmax weights.
        alpha2: L1 share of the hidden/softmax penalty.
        learning_rate: Learning rate passed to ``tf.train.AdamOptimizer``.
    """

    def __init__(self, X, Y, hidden_dims=None, epochs=1000,
                 lambda1=0.001, lambda2=1.0, alpha1=0.001, alpha2=0.0, learning_rate=0.1):
        # Avoid a shared mutable default argument.
        if hidden_dims is None:
            hidden_dims = [100]

        # X must be 2-D: (samples, features).
        _, n_feat = X.shape

        # One-hot encode Y.  Using the inverse index from np.unique (instead
        # of the raw label as a column index) keeps this correct for labels
        # that are not exactly 0..K-1 (e.g. {1, 2} or strings).
        classes, class_index = np.unique(Y, return_inverse=True)
        class_index = np.ravel(class_index)
        n_classes = len(classes)
        one_hot_Y = np.zeros((class_index.size, n_classes))
        one_hot_Y[np.arange(class_index.size), class_index] = 1

        self.epochs = epochs
        # Column k of the one-hot matrix corresponds to label classes[k].
        self.classes = classes

        # Keep the training data; train() samples batches from it.
        self.X = X
        self.Y = one_hot_Y

        # Placeholders with an undetermined batch dimension.
        self.var_X = tf.placeholder(dtype=tf.float32, shape=[None, n_feat], name='x')
        self.var_Y = tf.placeholder(dtype=tf.float32, shape=[None, n_classes], name='y')

        self.input_layer = One2OneInputLayer(self.var_X)

        # Stack the hidden layers; each one consumes the previous output.
        self.hidden_layers = []
        layer_input = self.input_layer.output
        for dim in hidden_dims:
            self.hidden_layers.append(DenseLayer(layer_input, dim))
            layer_input = self.hidden_layers[-1].output

        # Final classification layer.  `layer_input` is the last hidden
        # output, or the input layer's output when there are no hidden layers.
        self.softmax_layer = SoftmaxLayer(layer_input, n_classes, self.var_Y)

        # Regularization terms on the coefficients of the input layer.
        self.L1_input = tf.reduce_sum(tf.abs(self.input_layer.w))
        self.L2_input = tf.nn.l2_loss(self.input_layer.w)

        # Regularization terms on the weights of the hidden and softmax layers.
        weighted_layers = self.hidden_layers + [self.softmax_layer]
        self.L1 = tf.add_n([tf.reduce_sum(tf.abs(layer.w)) for layer in weighted_layers])
        self.L2_sqr = tf.add_n([tf.nn.l2_loss(layer.w) for layer in weighted_layers])

        # Cost with the two elastic-net penalties.
        # NOTE(review): tf.nn.l2_loss is documented as sum(w ** 2) / 2, so the
        # extra 0.5 below halves the L2 terms a second time.  Left unchanged
        # so existing lambda/alpha settings keep their meaning -- confirm
        # whether this is intended.
        self.cost = self.softmax_layer.cost \
                    + lambda1*(1.0-lambda2)*0.5*self.L2_input + lambda1*lambda2*self.L1_input \
                    + alpha1*(1.0-alpha2)*0.5 * self.L2_sqr + alpha1*alpha2*self.L1

        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)

        self.y = self.softmax_layer.y

    def train(self, batch_size=100):
        """Run ``self.epochs`` optimization steps and record the input weights.

        Each step draws one batch through ``get_batch`` (defined elsewhere;
        presumably it returns a random mini-batch of ``batch_size`` rows --
        confirm).  Every 50th step the cost on the current batch and the
        input-layer weights are printed.  When training finishes, the final
        input-layer weights are stored in ``self.selected_w``.

        Args:
            batch_size: Batch size forwarded to ``get_batch``.
        """
        # The context manager closes the session (and frees its resources)
        # even if a step raises.
        with tf.Session() as sess:
            sess.run(tf.initialize_all_variables())

            for i in range(self.epochs):
                x_batch, y_batch = get_batch(self.X, self.Y, batch_size)
                feed = {self.var_X: x_batch, self.var_Y: y_batch}
                sess.run(self.optimizer, feed_dict=feed)
                if (i + 1) % 50 == 0:
                    l = sess.run(self.cost, feed_dict=feed)
                    print('epoch {0}: global loss = {1}'.format(i, l))
                    self.selected_w = sess.run(self.input_layer.w)
                    print(self.selected_w)

            # Always expose the final weights, including when `epochs` is not
            # a multiple of 50 (or is smaller than 50).
            self.selected_w = sess.run(self.input_layer.w)

class One2OneInputLayer(object):
    """Element-wise (one-to-one) input layer: one scalar weight per feature."""

    def __init__(self, input):
        """
            Build the layer on top of `input`.

            The size of the second dimension of `input` is taken as the
            number of features (rows are samples, columns are features).
            Because the mapping is one to one, exactly that many weights
            are created, all starting at zero, and the output is the
            input with every column scaled by its own weight.
        """
        self.input = input

        # One weight per feature column.
        n_features = input.get_shape()[1].value
        self.w = tf.Variable(tf.zeros([n_features,]), name='w')

        # Broadcasts the weight vector over the batch dimension.
        self.output = self.w * self.input
        self.params = [self.w]

class DenseLayer(object):
    """Fully connected layer: activate(input . w + b)."""

    def __init__(self, input, n_out, activation='sigmoid'):
        """
            Build the layer on top of `input`.

            The size of the second dimension of `input` gives the
            number of incoming units (rows are samples, columns are
            features); n_out is the number of nodes in this layer.

            Weights and biases both start at one. The affine result
            is passed, together with the `activation` name, to the
            `activate` helper defined elsewhere in the project.
        """
        self.input = input
        fan_in = input.get_shape()[1].value

        # Layer parameters, initialised to ones.
        self.w = tf.Variable(tf.ones([fan_in, n_out]), name='w')
        self.b = tf.Variable(tf.ones([n_out]), name='b')

        # Affine transform followed by the requested non-linearity.
        pre_activation = tf.add(tf.matmul(input, self.w), self.b)
        self.output = activate(pre_activation, activation)

        # Only the weight matrix is listed in params (the bias is not).
        self.params = [self.w]

class SoftmaxLayer(object):
    """Linear output layer with a mean softmax cross-entropy cost."""

    def __init__(self, input, n_out, y):
        """
            Build the layer on top of `input`.

            The size of the second dimension of `input` gives the
            number of incoming units (rows are samples, columns are
            features).

            n_out defines how many output units (one logit per
            class) this layer produces.

            y is the target tensor the logits are compared against;
            it is passed as `labels` to the cross-entropy op.
        """
        n_in = input.get_shape()[1].value
        self.input = input

        # Initiate the weight and biases for this layer
        w = tf.Variable(tf.random_normal([n_in, n_out]), name='w')
        b = tf.Variable(tf.random_normal([n_out]), name='b')

        # Unnormalised class scores (logits).
        pred = tf.add(tf.matmul(input, w), b)

        # Pass logits/labels by keyword: TensorFlow >= 1.0 rejects positional
        # arguments for this op, while the keyword names are also accepted by
        # the older releases that took (logits, labels) positionally.
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))

        self.y = y
        self.w = w
        self.b = b
        self.cost = cost
        # Only the weight matrix is listed in params (the bias is not).
        self.params = [w]

Alexandre Passos · Accepted Answer

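Gradient descent algorithms such as Adam do not give exact zeros when using L1 regularization: each update only nudges a weight towards zero, so it ends up small but not exactly zero. Instead, something like FTRL or proximal Adagrad, which apply a thresholding step, can give you exact zeros.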

Why didn't the lasso here give me any zero coefficients?

Answers (1)

Related Questions

Why didn't the lasso here give me any zero coefficients?

Answers (1)

Related Questions

Why didn't the lasso here give me any zero coefficients?