safetyduck

Reputation: 6854

TensorBoard scalar summaries are single data points. How to fix?

tf.train.MonitoredSession

I am logging with the tf.summary.scalar method and with tf.train.LoggingTensorHook for some tensors, all within the tf.estimator.Estimator framework.

As far as I can tell, the tf.train.LoggingTensorHook output is not showing up at all. The tf.summary.scalar values do show up, but apparently only as a single data point rather than a series over time steps.

The graph and everything else (weights) look OK in TensorBoard.
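
My guess about the missing hook output (an assumption on my part, not something I have verified): tf.train.LoggingTensorHook writes through tf.logging at INFO level, and the default verbosity is WARN, so its output is silently dropped unless verbosity is raised first:

tf.logging.set_verbosity(tf.logging.INFO)  # LoggingTensorHook logs at INFO; the default (WARN) hides it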

UPDATE: it looks like calling train multiple times does produce a graph with multiple points. Is there something about steps and every_n_iter that does not interact as expected?

import numpy as np
import tensorflow as tf

m = 10000
n = 5
X = np.random.randn(m, n)
A = np.random.randn(n)
y = X.dot(A) + np.random.randn(m) * 0.1

batch_size = 1024

def input_fn(batch_size):
    ds = tf.data.Dataset.from_tensor_slices(dict(X=X, y=y))
    ds = ds.repeat(-1)
    ds = ds.batch(batch_size)
    return ds

def model_fn(features, labels, mode, params):
    X = features['X']
    y = features['y']
    l = X
    for i, k in enumerate([32, 16, 16]):
        l = tf.layers.dense(inputs=l, units=k, name=f'l_{i}', activation=tf.nn.tanh)
    some_thing = tf.reduce_sum(l, axis=1, name='some_thing')
    l = tf.layers.dense(inputs=l, units=1, name='l_final')
    predictions = tf.squeeze(l, axis=-1)
    loss = tf.losses.mean_squared_error(y, predictions, weights=1.0)
    metric_ops = {"mse": tf.metrics.mean_squared_error(labels=y, predictions=predictions)}
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, eval_metric_ops=metric_ops)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=metric_ops)
    raise Exception('should not hit this')

model = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir='/tmp/junk',
        config=None,
        params=dict(),
        warm_start_from=None
        )

tensors_to_log = dict(some_thing='some_thing')
logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=10)

train_input_fn = lambda: input_fn(batch_size)
test_input_fn = lambda: input_fn(batch_size)

train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, hooks=[logging_hook], max_steps=100)
eval_spec = tf.estimator.EvalSpec(input_fn=test_input_fn, hooks=[logging_hook])
out = tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
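
For reference, here is a sketch of what I mean by "calling train multiple times", together with lowering save_summary_steps (the directory and values below are illustrative only, not what I actually ran). Summaries are only written every save_summary_steps global steps (the default is 100), so with max_steps=100 there is essentially one point per scalar; more steps or a smaller interval gives an actual curve:

# Illustrative sketch only, reusing model_fn, input_fn and logging_hook from above.
config = tf.estimator.RunConfig(save_summary_steps=10)  # default is 100
model = tf.estimator.Estimator(model_fn=model_fn, model_dir='/tmp/junk2', config=config)

# global_step keeps increasing across calls, so each call appends more
# scalar points to the same event files in model_dir.
for _ in range(5):
    model.train(input_fn=lambda: input_fn(batch_size), steps=100, hooks=[logging_hook])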

UPDATE: the following version does not show anything in TensorBoard until the end of the run, and even then it only shows one point.

import numpy as np
import tensorflow as tf
# tf.enable_eager_execution()
tf.logging.set_verbosity(tf.logging.INFO)

m = 10000
n = 5
X = np.random.randn(m, n)
A = np.random.randn(n)
y = X.dot(A) + np.random.randn(m) * 0.1

steps = 1000
batch_size = 1024

def input_fn(repeat, batch_size):
    ds = tf.data.Dataset.from_tensor_slices(dict(X=X, y=y))
    ds = ds.repeat(repeat)
    ds = ds.batch(batch_size)
    return ds

def model_fn(features, labels, mode, params):
    X = features['X']
    y = features['y']
    l = X
    for i, k in enumerate([32, 16, 16]):
        l = tf.layers.dense(inputs=l, units=k, name=f'l_{i}', activation=tf.nn.tanh)
    some_thing = tf.reduce_sum(l, axis=1, name='some_thing')
    l = tf.layers.dense(inputs=l, units=1, name='l_final')
    predictions = tf.squeeze(l, axis=-1)
    loss = tf.losses.mean_squared_error(y, predictions, weights=1.0)
    metric_ops = {"mse": tf.metrics.mean_squared_error(labels=y, predictions=predictions)}

    tf.summary.scalar('summary_loss', loss)  # log the scalar training loss for TensorBoard
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op, eval_metric_ops=metric_ops)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=metric_ops)
    raise Exception('should not hit this')

model = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir='/tmp/junk',
        config=None,
        params=dict(),
        warm_start_from=None
        )

tensors_to_log = dict(some_thing='some_thing')
logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=10)

train_input_fn = lambda: input_fn(steps, batch_size)
test_input_fn = lambda: input_fn(steps, batch_size)

train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, hooks=[logging_hook], max_steps=None)
eval_spec = tf.estimator.EvalSpec(input_fn=test_input_fn, hooks=[logging_hook])
out = tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
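
To separate "the summaries are not being written at all" from "TensorBoard just has not caught up yet", the event files in model_dir can be read back directly. This is only a diagnostic sketch; 'summary_loss' is the tag from the tf.summary.scalar call above:

import glob

# Count how many 'summary_loss' points were actually written to disk.
for path in glob.glob('/tmp/junk/events.out.tfevents.*'):
    points = [(e.step, v.simple_value)
              for e in tf.train.summary_iterator(path)
              for v in e.summary.value
              if v.tag == 'summary_loss']
    print(path, '->', len(points), 'scalar points')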

Upvotes: 4

Views: 1974

Answers (1)

Ayman Salama

Reputation: 449

I ran into a very similar problem, and I was surprised that the solution was very simple: close TensorBoard, start it again, and wait a few minutes; it takes time to catch up. For some reason, if you start TensorBoard while training is still running, it gets stuck. I hope this helps. I was running the code on Google Cloud:

from google.datalab.ml import TensorBoard
TensorBoard().start('gs://{}/directoy_where_my_models_are'.format(BUCKET))
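
Outside of Datalab, the equivalent is to point a fresh TensorBoard at the Estimator's model_dir (for example, tensorboard --logdir /tmp/junk) and give it a minute or two to reload the event files.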


Upvotes: 2
