billiout

Reputation: 705

Masking the zero-padding embedding (and returning zero gradients) in TensorFlow, as in PyTorch

I'm trying to recreate the PoolNet model from Spotlight with the BPR loss in TensorFlow, but I can't get the same results. Below is the model I'm using (it's an Estimator model_fn).

import numpy as np
import tensorflow as tf


def _pooling_model_fn(features, labels, mode, params):
    with tf.name_scope('inputs'):
        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            users_prev_items_inputs_train = features['item_seqs']
        elif mode == tf.estimator.ModeKeys.PREDICT:
            users_prev_items_inputs_train = tf.reshape(features['item_seqs'], [1, -1])

    with tf.device('/cpu:0'):
        prod_embeddings = tf.keras.layers.Embedding(params["num_items"], params["item_emb_size"], mask_zero=True)
        item_biases = tf.keras.layers.Embedding(params["num_items"], 1, mask_zero=True, embeddings_initializer=tf.keras.initializers.Zeros())
        prod_embed = prod_embeddings(users_prev_items_inputs_train)
        targets = tf.transpose(prod_embed, [0, 2, 1])

    sequence_embeddings = tf.expand_dims(targets, axis=3)

    sequence_embeddings = tf.pad(sequence_embeddings, paddings=tf.constant([[0, 0], [0, 0], [1, 0], [0, 0]]))

    sequence_embedding_sum = tf.cumsum(sequence_embeddings, 2)

    non_padding_entries = tf.cumsum(tf.cast(tf.not_equal(sequence_embeddings, tf.constant(0.0)), tf.float32), 2)  # .expand_as(sequence_embedding_sum)

    user_representations = tf.squeeze((sequence_embedding_sum / (non_padding_entries + 1)), [3])

    user_representations_so_far = user_representations[:, :, :-1]
    user_representations_new = user_representations[:, :, -1]

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        global_step = tf.contrib.framework.get_or_create_global_step()

        with tf.name_scope('loss'):
            negative_samples = features['neg_samp']

            with tf.device('/cpu:0'):
                prod_embed_pos = prod_embeddings(users_prev_items_inputs_train)
                target_embedding_positive = tf.squeeze(tf.transpose(prod_embed_pos, [0, 2, 1]))

                prod_bias_pos = item_biases(users_prev_items_inputs_train)
                target_bias_positive = tf.squeeze(prod_bias_pos)

            dot_positive = tf.reduce_sum(user_representations_so_far * target_embedding_positive, 1) + target_bias_positive

            with tf.device('/cpu:0'):
                prod_embed_neg = prod_embeddings(negative_samples)
                target_embedding_negative = tf.squeeze(tf.transpose(prod_embed_neg, [0, 2, 1]))

                prod_bias_neg = item_biases(negative_samples)
                target_bias_negative = tf.squeeze(prod_bias_neg)

            dot_negative = tf.reduce_sum(user_representations_so_far * target_embedding_negative, 1) + target_bias_negative

            mask = tf.not_equal(users_prev_items_inputs_train, 0)

            loss = bpr_loss(dot_positive, dot_negative, mask)

    if mode == tf.estimator.ModeKeys.TRAIN:
        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(learning_rate=params["lr"])
        train_op = optimizer.minimize(loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.PREDICT:
        item_ids = np.arange(params['num_items']).reshape(-1, 1)
        item_ids_tensor = tf.convert_to_tensor(item_ids, dtype=tf.int64)

        with tf.device('/cpu:0'):
            prod_embed_pos = prod_embeddings(item_ids_tensor)  # tf.nn.embedding_lookup(prod_embeddings, item_ids_tensor)
            target_embedding_positive = tf.squeeze(tf.transpose(prod_embed_pos, [0, 2, 1]))

            prod_bias_pos = item_biases(item_ids_tensor)  # tf.nn.embedding_lookup(item_biases, item_ids_tensor)
            target_bias_positive = tf.squeeze(prod_bias_pos)

        dot_positive = tf.reduce_sum(user_representations_new * target_embedding_positive, 1) + target_bias_positive

        predictions = {
            'products': tf.reshape(dot_positive, [1, -1])
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)

and here is the loss function:

def bpr_loss(positive_predictions, negative_predictions, mask):
    loss1 = 1.0 - tf.nn.sigmoid(positive_predictions - negative_predictions)

    if mask is not None:
        mask = tf.cast(mask, loss1.dtype)
        final_loss = loss1 * mask
        return tf.reduce_sum(final_loss) / tf.reduce_sum(mask)

    return tf.reduce_mean(loss1)

With the above model, I can't get the same predictions on the exact same dataset (and with the same random seed) as I do with Spotlight. I've narrowed the problem down to the zero-padding. The data is generated as follows:

[[0,0,0,5,6,98],
 [0,62,15,4,8,47],
 [0,5,9,6,3,41],
 [78,21,2,56,1,3]]

The sequences have leading zero-padding so that every input sample has the same length.
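
(For what it's worth, this pre-padding is the kind of output tf.keras.preprocessing.sequence.pad_sequences produces with padding='pre'; the snippet below is only illustrative, not my actual pipeline.)

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Illustrative only: left-pad variable-length item-id sequences with zeros.
item_seqs = [[5, 6, 98], [62, 15, 4, 8, 47], [78, 21, 2, 56, 1, 3]]
padded = pad_sequences(item_seqs, maxlen=6, padding='pre', value=0)
# [[ 0  0  0  5  6 98]
#  [ 0 62 15  4  8 47]
#  [78 21  2 56  1  3]]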

Based on my code, I believed I had done everything needed to mask these zeros out: from the loss, from the embedding layer (using the mask_zero parameter in Keras), and from the averaging of the embeddings that I'm computing (using the cumsum). Still, after training, the embedding at index zero keeps changing; instead of being excluded it is taken into account, which influences the other gradients and adds noise to my results.
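
A minimal sketch (TF 1.x, illustrative sizes) of the effect I'm seeing: even with mask_zero=True, row 0 of the embedding matrix changes after a single optimizer step.

import numpy as np
import tensorflow as tf

emb = tf.keras.layers.Embedding(10, 4, mask_zero=True)
ids = tf.constant([[0, 0, 3, 5]], dtype=tf.int64)
loss = tf.reduce_sum(emb(ids))  # any loss that touches the padded positions
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    before = sess.run(emb.embeddings)[0]
    sess.run(train_op)
    after = sess.run(emb.embeddings)[0]
    print(np.allclose(before, after))  # False: row 0 still receives gradient updates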

PyTorch has a nice feature in its implementation of the Embedding layer where you can set padding_idx to the id of the pad token; that row is initialized with zeros and its gradient is always kept at zero. Basically, I'm trying to do the same thing in TensorFlow.
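
For reference, a minimal sketch of that PyTorch behaviour (illustrative sizes, not my actual model):

import torch
import torch.nn as nn

# padding_idx=0: row 0 is initialized to zeros and never receives gradients.
emb = nn.Embedding(num_embeddings=100, embedding_dim=8, padding_idx=0)
out = emb(torch.tensor([[0, 0, 3, 5]]))
out.sum().backward()
print(emb.weight.grad[0])  # all zeros, even though index 0 appears in the input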

Any help would be appreciated.

Upvotes: 3

Views: 2489

Answers (1)

billiout

Reputation: 705

I solved it using the following workaround posted on TensorFlow's GitHub. It seems to work now.

mask_padding_zero_op = tf.scatter_update(lookup_table,
                                         PADDING_ID,
                                         tf.zeros([EMBEDDING_DIM], dtype=DTYPE))

with tf.control_dependencies([mask_padding_zero_op]):
    # do embedding lookup...
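
For completeness, here is a minimal sketch of how this can be wired into a lookup; the variable and constant names are illustrative, not from the question's model:

import tensorflow as tf  # TF 1.x, as in the question

PADDING_ID = 0
NUM_ITEMS = 1000       # illustrative sizes
EMBEDDING_DIM = 32
DTYPE = tf.float32

lookup_table = tf.get_variable('item_embeddings',
                               shape=[NUM_ITEMS, EMBEDDING_DIM],
                               dtype=DTYPE)

# Overwrite the padding row with zeros before every lookup, so whatever
# update it received in the previous step is discarded.
mask_padding_zero_op = tf.scatter_update(lookup_table,
                                         PADDING_ID,
                                         tf.zeros([EMBEDDING_DIM], dtype=DTYPE))

item_seqs = tf.placeholder(tf.int64, shape=[None, None])
with tf.control_dependencies([mask_padding_zero_op]):
    item_embeddings = tf.nn.embedding_lookup(lookup_table, item_seqs)

Note that this does not block the gradient the way PyTorch's padding_idx does; it simply resets the padding row to zeros before each lookup, so any update that row picked up in the previous step has no effect.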

Upvotes: 2
