Reputation: 524
I'm creating an autoencoder as part of my full model for a Kaggle competition. I'm trying to tie the weights of the encoder, transposed, to the decoder. Before the first epoch the weights are correctly synced; after that, the decoder weights just freeze and don't keep up with the encoder weights, which are being updated by gradient descent.
I searched for 12 hours through almost every post about this problem I could find on Google, and no one seems to have the answer for my case. The closest one is Tying Autoencoder Weights in a Dense Keras Layer, but there the problem was solved by not using a variable tensor as the kernel. I'm already not using that type of tensor as my decoder kernel, so that was no use.
I'm using a DenseTied custom Keras layer class defined in this article: https://towardsdatascience.com/build-the-right-autoencoder-tune-and-optimize-using-pca-principles-part-ii-24b9cca69bd6. It is exactly the same; I only changed the way I reference the Keras backend to suit my import style.
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
This is the custom layer definition:
class DenseTied(tf.keras.layers.Layer):
    def __init__(self, units,
                 activation=None,
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 tied_to=None,
                 **kwargs):
        self.tied_to = tied_to
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super().__init__(**kwargs)
        self.units = units
        self.activation = tf.keras.activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
        self.bias_initializer = tf.keras.initializers.get(bias_initializer)
        self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
        self.bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
        self.activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
        self.kernel_constraint = tf.keras.constraints.get(kernel_constraint)
        self.bias_constraint = tf.keras.constraints.get(bias_constraint)
        self.input_spec = tf.keras.layers.InputSpec(min_ndim=2)
        self.supports_masking = True

    def build(self, input_shape):
        assert len(input_shape) >= 2
        input_dim = input_shape[-1]
        if self.tied_to is not None:
            self.kernel = tf.keras.backend.transpose(self.tied_to.kernel)
            self.non_trainable_weights.append(self.kernel)
        else:
            self.kernel = self.add_weight(shape=(input_dim, self.units),
                                          initializer=self.kernel_initializer,
                                          name='kernel',
                                          regularizer=self.kernel_regularizer,
                                          constraint=self.kernel_constraint)
        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units,),
                                        initializer=self.bias_initializer,
                                        name='bias',
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = None
        self.input_spec = tf.keras.layers.InputSpec(min_ndim=2, axes={-1: input_dim})
        self.built = True

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) >= 2
        output_shape = list(input_shape)
        output_shape[-1] = self.units
        return tuple(output_shape)

    def call(self, inputs):
        output = tf.keras.backend.dot(inputs, self.kernel)
        if self.use_bias:
            output = tf.keras.backend.bias_add(output, self.bias, data_format='channels_last')
        if self.activation is not None:
            output = self.activation(output)
        return output
And this is the model training and testing with a dummy dataset:
rand_samples = np.random.rand(16, 51)
dummy_ds = tf.data.Dataset.from_tensor_slices((rand_samples, rand_samples)).shuffle(16).batch(16)
encoder = tf.keras.layers.Dense(1, activation="linear", input_shape=(51,), use_bias=True)
decoder = DenseTied(51, activation="linear", tied_to=encoder, use_bias=True)
autoencoder = tf.keras.Sequential()
autoencoder.add(encoder)
autoencoder.add(decoder)
autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='sgd')
autoencoder.summary()
print("Encoder Kernel Before 1 Epoch", encoder.kernel[0])
print("Decoder Kernel Before 1 Epoch", decoder.kernel[0][0])
autoencoder.fit(dummy_ds, epochs=1)
print("Encoder Kernel After 1 Epoch", encoder.kernel[0])
print("Decoder Kernel After 1 Epoch", decoder.kernel[0][0])
The expected output is to have the two kernels exactly the same in the first element (printing just one weight for simplicity).
The current output shows that the decoder kernel is not updated to match the transposed encoder kernel:
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
dense (Dense)                (None, 1)                 52
_________________________________________________________________
dense_tied (DenseTied)       (None, 51)                103
=================================================================
Total params: 103
Trainable params: 103
Non-trainable params: 0
_________________________________________________________________
Encoder Kernel Before 1 Epoch tf.Tensor([0.20486075], shape=(1,), dtype=float32)
Decoder Kernel Before 1 Epoch tf.Tensor(0.20486075, shape=(), dtype=float32)
1/1 [==============================] - 1s 657ms/step - loss: 0.3396 - accuracy: 0.0000e+00
Encoder Kernel After 1 Epoch tf.Tensor([0.20530733], shape=(1,), dtype=float32)
Decoder Kernel After 1 Epoch tf.Tensor(0.20486075, shape=(), dtype=float32)
I don't see what I'm doing wrong.
Upvotes: 6
Views: 1847
Reputation: 11895
To tie the weights, I would suggest using the Keras functional API, which enables sharing layers (a minimal sketch of what sharing means follows).
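In the functional API, sharing just means calling the same layer instance on more than one tensor, so every call reuses one set of weight variables; a minimal sketch, with x1 and x2 standing in for any existing tensors:

shared = tf.keras.layers.Dense(8)  # one layer instance, one kernel/bias
y1 = shared(x1)                    # both calls read and update
y2 = shared(x2)                    # the same variables

That said, here is an alternative implementation that ties the weights between the encoder and decoder: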
class TransposableDense(tf.keras.layers.Dense):
    def __init__(self, units, **kwargs):
        super().__init__(units, **kwargs)

    def build(self, input_shape):
        assert len(input_shape) >= 2
        input_dim = input_shape[-1]
        self.t_output_dim = input_dim

        self.kernel = self.add_weight(shape=(int(input_dim), self.units),
                                      initializer=self.kernel_initializer,
                                      name='kernel',
                                      regularizer=self.kernel_regularizer,
                                      constraint=self.kernel_constraint)
        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units,),
                                        initializer=self.bias_initializer,
                                        name='bias',
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
            self.bias_t = self.add_weight(shape=(input_dim,),
                                          initializer=self.bias_initializer,
                                          name='bias_t',
                                          regularizer=self.bias_regularizer,
                                          constraint=self.bias_constraint)
        else:
            self.bias = None
            self.bias_t = None
        # self.input_spec = tf.keras.layers.InputSpec(min_ndim=2, axes={-1: input_dim})
        self.built = True

    def call(self, inputs, transpose=False):
        bs, input_dim = inputs.get_shape()

        kernel = self.kernel
        bias = self.bias
        if transpose:
            assert input_dim == self.units
            kernel = tf.keras.backend.transpose(kernel)
            bias = self.bias_t

        output = tf.keras.backend.dot(inputs, kernel)
        if self.use_bias:
            output = tf.keras.backend.bias_add(output, bias, data_format='channels_last')
        if self.activation is not None:
            output = self.activation(output)
        return output

    def compute_output_shape(self, input_shape):
        bs, input_dim = input_shape
        output_dim = self.units
        if input_dim == self.units:
            output_dim = self.t_output_dim
        return bs, output_dim
The kernel of this dense layer can be transposed by calling the layer with transpose=True. Note that this might break some basic Keras principles (e.g. the layer has multiple output shapes), but it should work for your case.
Here is an example showing how you can use it to define your model:
a = tf.keras.layers.Input((51,))
dense = TransposableDense(1, activation='linear', use_bias=True)
encoder_out = dense(a)
decoder_out = dense(encoder_out, transpose=True)
encoder = tf.keras.Model(a, encoder_out)
autoencoder = tf.keras.Model(a, decoder_out)
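As a quick sanity check (reusing rand_samples from the question), training for an epoch and printing the kernel shows there is only a single weight matrix, so the encoder and decoder cannot drift apart:

autoencoder.compile(loss='mean_squared_error', optimizer='sgd')
autoencoder.fit(rand_samples, rand_samples, epochs=1, verbose=0)
# dense.kernel is the only kernel variable; the decoder path reads its
# transpose at call time instead of keeping a stale copy.
print("Shared kernel after 1 epoch:", dense.kernel[0])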
Upvotes: 1
Reputation: 10474
The weights are not tied. You are just initializing the weights of the tied layer with the transposed weights of the first layer and then never training them: transpose returns a new tensor (a different object) and add_weight creates a new variable, so any relation between the two layers is lost after build. I think it would be better to do something like this:
def call(self, inputs):
    # Read the encoder's kernel at call time, so the decoder always sees
    # the current weights instead of a stale copy made during build().
    output = tf.keras.backend.dot(inputs,
                                  tf.keras.backend.transpose(self.tied_to.kernel))
    if self.use_bias:
        # Keep the decoder's own bias: the encoder's bias has shape (1,)
        # and cannot be added to the 51-dimensional decoder output.
        output = tf.keras.backend.bias_add(output, self.bias, data_format='channels_last')
    if self.activation is not None:
        output = self.activation(output)
    return output
Here, the tied layer always uses the first layer's kernel explicitly and has no kernel weight of its own, so the kernel add_weight branch can be removed from build (the bias add_weight stays, for the shape reason noted in the comment above).
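For completeness, a minimal sketch of the trimmed build, assuming the rest of the DenseTied class from the question stays unchanged:

def build(self, input_shape):
    assert len(input_shape) >= 2
    # No kernel of our own: call() reads the transpose of tied_to.kernel,
    # so encoder updates are picked up automatically on every forward pass.
    if self.use_bias:
        self.bias = self.add_weight(shape=(self.units,),
                                    initializer=self.bias_initializer,
                                    name='bias',
                                    regularizer=self.bias_regularizer,
                                    constraint=self.bias_constraint)
    else:
        self.bias = None
    self.built = True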
Upvotes: 0