How does one use keras add_weight() vars with tensorflow probability distributions?

Question

I am creating a new Keras layer that accepts a vector of input data and is parameterized by two scalars: a mean and a standard deviation. I model the input data as a normal distribution and estimate its mean and standard deviation through gradient descent. However, when I construct tfp.distributions.Normal(mu, sigma), where mu and sigma are created with add_weight() during build(), the gradients do not propagate to mu and sigma.

The tensorflow probability documentation states that you can pass in training variables for distribution parameters and backprop through them. How do I get this to work inside of keras?

Below is a minimal working example.

import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
# Short aliases for the Keras and TensorFlow Probability namespaces.
tfk = tf.keras
tfkl = tf.keras.layers
tfd = tfp.distributions
tfpl = tfp.layers
# Lower bound of the uniform range used to initialise the std weight.
EPS = 1e-5

# Toy data set: `batch_size` rows of `N` standard-normal samples each.
batch_size, N = 4, 100
x = np.random.randn(batch_size, N)

class NormalLikelihood(tf.keras.layers.Layer):
    """Layer that scores its input under a Normal with learnable parameters.

    ``build`` creates two weights of shape ``[1]``: a mean and a
    non-negative standard deviation. ``call`` returns the clipped
    probability density of every input element.
    """

    def __init__(self):
        super().__init__()

    def build(self, input_shape):
        """Create the mean/std weights and the distribution built from them."""
        mean_init = tf.keras.initializers.RandomNormal(mean=0.0, stddev=1)
        self.mu = self.add_weight(
            "mean", shape=[1], initializer=mean_init, dtype=tf.float32)

        # Sigma starts in [EPS, 5) and is constrained to stay non-negative.
        std_init = tf.keras.initializers.RandomUniform(
            minval=EPS, maxval=5.0, seed=None)
        self.sigma = self.add_weight(
            "std",
            shape=[1],
            initializer=std_init,
            constraint=tf.keras.constraints.non_neg(),
            dtype=tf.float32,
        )

        # The distribution is created once, here, from element 0 of each
        # weight.
        loc = self.mu[0]
        scale = self.sigma[0]
        self.distribution = tfp.distributions.Normal(loc, scale)

    def call(self, input):
        """Return the density of ``input``, clipped to [1e-3, 1 - 1e-3]."""
        density = self.distribution.prob(input)
        return tf.clip_by_value(density, 1e-3, 1-1e-3)

# Wire the layer into a functional model whose output is the negative
# log-likelihood of the input batch.
input_layer = tf.keras.layers.Input(shape=(100,))
likelihood = NormalLikelihood()(input_layer)
nll = -tf.reduce_sum(tf.math.log(likelihood))
model = tf.keras.models.Model(input_layer, nll)

# No targets and no compiled loss: training is driven only by the term
# registered through add_loss.
model.add_loss(nll)
model.compile(optimizer='rmsprop', loss=None)
model.fit(x, y=None)

This code results in builtins.ValueError: No gradients provided for any variable: ['normal_likelihood/mean:0', 'normal_likelihood/std:0'] which is not expected. Desired behavior would be that ['normal_likelihood/mean:0', 'normal_likelihood/std:0'] have gradients provided for them.

See the code in google colab: https://colab.research.google.com/drive/1_u4XTCIH-2qwNSgv9zkZiCG_zeCIEZGp?usp=sharing

o-90 · Accepted Answer

Change tfp.distributions.Normal(self.mu[0], self.sigma[0]) to tfp.distributions.Normal(self.mu, self.sigma).

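This works because, under the hood, Keras's .fit() computes gradients of the loss with respect to the layer's trainable variables. Indexing a weight with [0] inside build() reads the variable once and produces a plain tensor holding its value at that moment. The distribution then stores that tensor instead of the variable, so nothing computed in call() depends on the trainable variables: from the optimizer's point of view the parameters are constants, the chain rule has no path back to the weights, and every gradient is None. Passing the variables themselves keeps that path intact.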

Example:

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp


# Lower bound of the uniform range used to initialise the std weights.
EPS = 1e-5


class NormalLikelihoodYours(tf.keras.layers.Layer):
    """The layer from the question, reproduced to show the missing gradients.

    The distribution is built from ``self.mu[0]`` and ``self.sigma[0]``;
    this indexing is kept on purpose so the ``[None, None]`` gradient
    result can be reproduced.
    """

    def __init__(self):
        super().__init__()

    def build(self, input_shape):
        """Create the mean/std weights and the distribution built from them."""
        mean_init = tf.keras.initializers.RandomNormal(mean=0.0, stddev=1)
        self.mu = self.add_weight(
            "mean", shape=[1], initializer=mean_init, dtype=tf.float32)

        # Sigma starts in [EPS, 5) and is constrained to stay non-negative.
        std_init = tf.keras.initializers.RandomUniform(
            minval=EPS, maxval=5.0, seed=None)
        self.sigma = self.add_weight(
            "std",
            shape=[1],
            initializer=std_init,
            constraint=tf.keras.constraints.non_neg(),
            dtype=tf.float32,
        )

        # Indexing happens here, once; the distribution stores the results
        # of the indexing rather than the weights themselves.
        loc = self.mu[0]
        scale = self.sigma[0]
        self.distribution = tfp.distributions.Normal(loc, scale)

    def call(self, input):
        """Return the density of ``input``, clipped to [1e-3, 1 - 1e-3]."""
        density = self.distribution.prob(input)
        return tf.clip_by_value(density, 1e-3, 1-1e-3)
    

class NormalLikelihoodMine(tf.keras.layers.Layer):
    """Normal-likelihood layer whose mean and std receive gradients.

    ``build`` creates two weights of shape ``[1]`` (a mean and a
    non-negative standard deviation) and hands the weights themselves to
    the distribution. ``call`` returns the clipped probability density of
    every input element.
    """

    def __init__(self):
        super().__init__()

    def build(self, input_shape):
        """Create the mean/std weights and the distribution that uses them."""
        # `name` is passed by keyword: it is the first positional parameter
        # of add_weight in Keras 2 but not in Keras 3, where `shape` comes
        # first.
        self.mu = self.add_weight(
            name="mean", shape=[1],
            initializer=tf.keras.initializers.RandomNormal(
                mean=0.0, stddev=1), dtype=tf.float32)
        # Sigma starts in [EPS, 5) and is constrained to stay non-negative.
        self.sigma = self.add_weight(
            name="std", shape=[1],
            initializer=tf.keras.initializers.RandomUniform(
                minval=EPS, maxval=5.0, seed=None),
            constraint=tf.keras.constraints.non_neg(),
            dtype=tf.float32)
        # Pass the weights themselves (no `[0]` indexing) so the
        # distribution's parameters are the trainable variables.
        self.distribution = tfp.distributions.Normal(self.mu, self.sigma)

    def call(self, input):
        """Return the density of ``input``, clipped to [1e-3, 1 - 1e-3]."""
        r = self.distribution.prob(input)
        r = tf.clip_by_value(r, 1e-3, 1-1e-3)
        return r

# loss function
def calc_loss(logits):
    """Return minus the sum of the element-wise log of ``logits``.

    NOTE(review): despite the parameter name, the callers in this example
    pass the layers' clipped probabilities, not raw logits.
    """
    log_values = tf.math.log(logits)
    return -tf.math.reduce_sum(log_values)

# model input: a symbolic input with 100 features per example, plus a
# concrete batch of 4 such examples used for the gradient checks
input_layer = tf.keras.layers.Input(shape=(100,))
x_in = tf.random.normal([4, 100])

# your model
your_output = NormalLikelihoodYours()(input_layer)
your_model = tf.keras.models.Model(input_layer, your_output)

# my model
my_output = NormalLikelihoodMine()(input_layer)
my_model = tf.keras.models.Model(input_layer, my_output)

# Your model has no gradients: its distribution was built from
# `self.mu[0]` / `self.sigma[0]`, i.e. from the results of indexing done
# once in `build()`, so the loss computed under the tape is not connected
# to the trainable variables themselves.
with tf.GradientTape() as your_tape:
    your_loss = calc_loss(your_model(x_in))

print(your_tape.gradient(your_loss, your_model.trainable_variables))
# [None, None]

# My model has gradients because its loss and the weights in
# `trainable_variables` are connected.
with tf.GradientTape() as my_tape:
    my_loss = calc_loss(my_model(x_in))

print(my_tape.gradient(my_loss, my_model.trainable_variables))
# [<tf.Tensor: shape=(1,), dtype=float32, numpy=array([...], dtype=float32)>,
#  <tf.Tensor: shape=(1,), dtype=float32, numpy=array([...], dtype=float32)>]
# (values elided: they change from run to run because the weights and
# `x_in` are random)

How does one use keras add_weight() vars with tensorflow probability distributions?

Answers (1)

Related Questions