Tensorflow 2.1 full memory and tf.function called twice

Question

I'm developing a Convolutional Autoencoder with Tensorflow 2.1.

This is the code

class ConvAutoencoder:
    """Convolutional autoencoder built from separate encoder and decoder Keras models.

    Attributes:
        input_shape: Shape of one input sample, indexed as
            ``(dim0, dim1, channels)``; ``input_shape[2]`` is used as the
            number of output filters of the decoder.
        latent_dim: Number of units in the encoder's final Dense layer.
        encoder: Keras ``Model`` mapping an input sample to its latent code.
        decoder: Keras ``Model`` mapping a latent code back to an image.
        autoencoder: Keras ``Model`` chaining ``encoder`` and ``decoder``.
        optimizer: ``Adam`` optimizer created with default arguments.
    """

    def __init__(self, input_shape, latent_dim):
        """Store the hyper-parameters and build the three models.

        Args:
            input_shape: Shape of one input sample (without the batch axis).
            latent_dim: Size of the latent representation.
        """
        self.input_shape = input_shape
        self.latent_dim = latent_dim
        self.__create_model()

    def __create_model(self):
        """Build encoder, decoder and the end-to-end autoencoder.

        Prints the summary of each model as it is created.
        """
        # Define Encoder: one stride-1 convolution followed by three
        # stride-2 convolutions, then two Dense layers down to latent_dim.
        encoder_input = Input(shape=self.input_shape, name='encoder_input')
        x = Conv2D(filters=16, kernel_size=5, activation='relu', padding='same')(encoder_input)
        x = Conv2D(filters=32, kernel_size=3, strides=2, activation='relu', padding='same')(x)
        x = Conv2D(filters=64, kernel_size=3, strides=2, activation='relu', padding='same')(x)
        x = Conv2D(filters=128, kernel_size=2, strides=2, activation='relu', padding='same')(x)
        # Remember the feature-map shape so the decoder can rebuild it.
        last_conv_shape = x.shape
        x = Flatten()(x)
        x = Dense(256, activation='relu')(x)
        x = Dense(units=self.latent_dim, name='encoded_rep')(x)
        self.encoder = Model(encoder_input, x, name='encoder_model')
        self.encoder.summary()

        # Define Decoder: Dense layers back up to the flattened feature map,
        # then transposed convolutions mirroring the encoder's strides.
        height, width, depth = last_conv_shape[1], last_conv_shape[2], last_conv_shape[3]
        # Keras documents ``shape`` as a tuple, so wrap the scalar latent size.
        decoder_input = Input(shape=(self.latent_dim,), name='decoder_input')
        x = Dense(units=256)(decoder_input)
        x = Dense(units=(height * width * depth), activation='relu')(x)
        x = Reshape(target_shape=(height, width, depth))(x)
        x = Conv2DTranspose(filters=128, kernel_size=2, activation='relu', padding='same')(x)
        x = Conv2DTranspose(filters=64, kernel_size=3, strides=2, activation='relu', padding='same')(x)
        x = Conv2DTranspose(filters=32, kernel_size=3, strides=2, activation='relu', padding='same')(x)
        x = Conv2DTranspose(filters=16, kernel_size=5, strides=2, activation='relu', padding='same')(x)
        # Final layer emits as many channels as the input has.
        x = Conv2DTranspose(filters=self.input_shape[2], kernel_size=5, activation='sigmoid', padding='same')(x)
        self.decoder = Model(decoder_input, x, name='decoder_model')
        self.decoder.summary()

        # Define Autoencoder from encoder input to decoder output
        self.autoencoder = Model(encoder_input, self.decoder(self.encoder(encoder_input)))
        self.optimizer = Adam()
        self.autoencoder.summary()


@tf.function
def compute_loss(model, batch):
    """Return the reconstruction loss of ``model.autoencoder`` on ``batch``.

    The squared error between ``batch`` and its reconstruction is summed
    over axes 1, 2 and 3 of each sample and then averaged over the
    remaining (first) axis.
    """
    reconstruction = model.autoencoder(batch)
    squared_error = tf.square(batch - reconstruction)
    per_sample_error = tf.reduce_sum(squared_error, axis=[1, 2, 3])
    return tf.reduce_mean(per_sample_error)


@tf.function
def train(train_data, model, epochs=2, batch_size=32):
    """Train ``model.autoencoder`` on ``train_data`` with mini-batch gradient steps.

    Args:
        train_data: Training samples; must support ``len()`` and slicing
            along the first axis.
        model: Object exposing ``autoencoder`` (the network being trained)
            and ``optimizer``.
        epochs: Number of passes over ``train_data``.
        batch_size: Number of samples taken per gradient step.
    """
    # NOTE(review): both the epoch loop and the batch loop are inside the
    # @tf.function-decorated body.
    for epoch in range(epochs):
        # Walk over the data in strides of batch_size; the range is wrapped
        # in tqdm (presumably for a progress bar).
        for i in tqdm(range(0, len(train_data), batch_size)):
            batch = train_data[i: i + batch_size]
            # Record the loss computation so it can be differentiated.
            with tf.GradientTape() as tape:
                loss = compute_loss(model, batch)
            # Gradients of the loss w.r.t. every trainable variable, applied
            # through the model's optimizer.
            gradients = tape.gradient(loss, model.autoencoder.trainable_variables)
            model.optimizer.apply_gradients(zip(gradients, model.autoencoder.trainable_variables))


if __name__ == "__main__":
    # Image side length and number of channels used throughout the script.
    img_dim, channels = 64, 1
    target_size = (img_dim, img_dim)
    batch_shape = (-1, img_dim, img_dim, channels)

    # Labels are not needed for an autoencoder, so they are discarded.
    (x_train, _), (x_test, _) = mnist.load_data()

    # Resize every image to (img_dim x img_dim).
    x_train = np.array([cv2.resize(sample, target_size) for sample in x_train])
    x_test = np.array([cv2.resize(sample, target_size) for sample in x_test])

    # Convert to float32, divide by 255 and add the explicit channel axis.
    x_train = (x_train.astype('float32') / 255.).reshape(batch_shape)
    x_test = (x_test.astype('float32') / 255.).reshape(batch_shape)

    # Build the autoencoder.
    autoenc = ConvAutoencoder(input_shape=(img_dim, img_dim, channels), latent_dim=4)

    # Run the training loop on the training split.
    train(train_data=x_train, model=autoenc, epochs=2, batch_size=32)

There are two problems:

1. The function train(), which is decorated with @tf.function, appears to be called twice. This doesn't happen without the @tf.function decorator.
2. Each epoch of training increases memory consumption by around 3 GB.

What am I doing wrong?

Other info:

Tensorflow version: 2.1.0
Python version 3.7.5
Tensorflow is not using GPU since I still have driver problems

There's nothing much to say in addition but StackOverflow is forcing me to write something

TF_Support · Accepted Answer

For your first problem: when you decorate a function with @tf.function, the function is traced as well as executed.
During tracing, eager execution is disabled in that context, so every tf.* call just defines a tf.Operation node that produces a tf.Tensor output.

Code debugging 1 :

# Train autoencoder (epochs raised to 5 for this debugging run)
    train(train_data=x_train, model=autoenc, epochs=5, batch_size=32)

Note: the number of epochs was increased to 5, and a smaller dataset was used, to make debugging easier.

Train Function:

@tf.function
def train(train_data, model, epochs=2, batch_size=32):
    """Instrumented copy of ``train`` that reports Python vs. graph execution.

    Args:
        train_data: Training samples; must support ``len()`` and slicing
            along the first axis.
        model: Object exposing ``autoencoder`` and ``optimizer``.
        epochs: Number of passes over ``train_data``.
        batch_size: Number of samples taken per gradient step.
    """
    for epoch in range(epochs):
      print("Python execution: ", epoch)   # Python print: only prints while the body runs as Python
      tf.print("Graph execution: ", epoch) # tf.print: only prints while the graph executes

      # The tqdm-wrapped loop below raised an error here, so a plain range is used instead:
      # for i in tqdm(range(0, len(train_data), batch_size)):
      for i in range(0, len(train_data), batch_size):
          batch = train_data[i: i + batch_size]
          # Record the loss computation so it can be differentiated.
          with tf.GradientTape() as tape:
              loss = compute_loss(model, batch)
          # Compute gradients for every trainable variable and apply them.
          gradients = tape.gradient(loss, model.autoencoder.trainable_variables)
          model.optimizer.apply_gradients(zip(gradients, model.autoencoder.trainable_variables))

Here is the output of your original code when instrumented with Python's print() and TensorFlow's tf.print().
The function appears to be "executed" twice, but the first pass is the trace that builds the graph and the second is the actual graph execution; subsequent calls to the function reuse the graph that AutoGraph has already generated.

Given this, it is better to keep the epoch loop outside the function decorated with @tf.function, in plain Python.

Code debugging 2 :

    # Train autoencoder: the epoch loop now runs in plain Python and calls
    # train() once per epoch.
    epochs = 5
    print(f'Loop Training using Dataset (Epochs : {epochs})')
    for epoch_idx in range(epochs):
        train(train_data=x_train, model=autoenc, batch_size=32)

Train Function:

@tf.function
def train(train_data, model, batch_size=32):
      """Run one instrumented pass over ``train_data``.

      Args:
          train_data: Training samples; must support ``len()`` and slicing
              along the first axis.
          model: Object exposing ``autoencoder`` and ``optimizer``.
          batch_size: Number of samples taken per gradient step.
      """
      print("Python execution")   # Python print: only prints while the body runs as Python
      tf.print("Graph execution") # tf.print: only prints while the graph executes

      # tqdm-wrapped variant left disabled; a plain range is used instead:
      # for i in tqdm(range(0, len(train_data), batch_size)):
      for i in range(0, len(train_data), batch_size):
          batch = train_data[i: i + batch_size]
          # Record the loss computation so it can be differentiated.
          with tf.GradientTape() as tape:
              loss = compute_loss(model, batch)
          # Compute gradients for every trainable variable and apply them.
          gradients = tape.gradient(loss, model.autoencoder.trainable_variables)
          model.optimizer.apply_gradients(zip(gradients, model.autoencoder.trainable_variables))
      print("#################") # Visual separator, for debugging purposes only

Here is the output of the modified flow and function. You can still see that the function appears to be "executed" twice. It then runs the training for the 5 epochs using the graph built by AutoGraph: every subsequent call to the train function executes in graph mode, resulting in a shorter execution time thanks to TensorFlow's optimizations.

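For your second problem, regarding running out of memory:

You could try feeding the data through the tf.data Dataset API (for example a generator-backed dataset) rather than loading your entire dataset into memory. Note also that a Python for loop over range(...) inside a @tf.function is unrolled while tracing, so the graph ends up containing one copy of the training step per batch, which can itself consume a lot of memory.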

You could read more about this in this link.

Tensorflow 2.1 full memory and tf.function called twice

Answers (1)

Related Questions