SantoshGupta7

Reputation: 6197

TPU: NameError: name '_minimize' is not defined when defining Keras custom train_step

I have a model that runs just fine on the GPU, but gives an error on TPU.

I am trying to define my own custom model in TensorFlow Keras, code below:

class CustomModel(tf.keras.Model):
    def train_step(self, data):
        # Unpack the data. Its structure depends on your model and
        # on what you pass to `fit()`.
        x = data
        y = tf.constant([1.0], dtype=tf.float32)

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)  # Forward pass
            # Compute the loss value
            # (the loss function is configured in `compile()`)
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
        
        _minimize(self.distribute_strategy, tape, self.optimizer, loss,
                self.trainable_variables)

        self.compiled_metrics.update_state(y, y_pred)
        return {m.name: m.result() for m in self.metrics}

But when I try to train, I run into

NameError: name '_minimize' is not defined, even though _minimize is defined in training.py, the same file that defines the Model class I am inheriting from: https://github.com/tensorflow/tensorflow/blob/2434d2401399e3973d2f704f977bd6ad2d029ca7/tensorflow/python/keras/engine/training.py#L2699
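
As far as I can tell, _minimize is a private module-level function in training.py, not a method on tf.keras.Model, so it is not inherited by my subclass. If I wanted to call it by name, I would presumably have to import it myself (a private API, so this may break between TF versions):

from tensorflow.python.keras.engine.training import _minimize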

Here is the full error message:

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-44-2b800165a5d8> in <module>()
     13         validation_data=val_dataset,
     14         validation_steps=val_steps,
---> 15         validation_freq=1)

10 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
     64   def _method_wrapper(self, *args, **kwargs):
     65     if not self._in_multi_worker_mode():  # pylint: disable=protected-access
---> 66       return method(self, *args, **kwargs)
     67 
     68     # Running inside `run_distribute_coordinator` already.

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
    846                 batch_size=batch_size):
    847               callbacks.on_train_batch_begin(step)
--> 848               tmp_logs = train_function(iterator)
    849               # Catch OutOfRangeError for Datasets of unknown size.
    850               # This blocks until the batch has finished executing.

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
    578         xla_context.Exit()
    579     else:
--> 580       result = self._call(*args, **kwds)
    581 
    582     if tracing_count == self._get_tracing_count():

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
    625       # This is the first call of __call__, so we have to initialize.
    626       initializers = []
--> 627       self._initialize(args, kwds, add_initializers_to=initializers)
    628     finally:
    629       # At this point we know that the initialization is complete (or less

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
    504     self._concrete_stateful_fn = (
    505         self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access
--> 506             *args, **kwds))
    507 
    508     def invalid_creator_scope(*unused_args, **unused_kwds):

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
   2444       args, kwargs = None, None
   2445     with self._lock:
-> 2446       graph_function, _, _ = self._maybe_define_function(args, kwargs)
   2447     return graph_function
   2448 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
   2775 
   2776       self._function_cache.missed.add(call_context_key)
-> 2777       graph_function = self._create_graph_function(args, kwargs)
   2778       self._function_cache.primary[cache_key] = graph_function
   2779       return graph_function, args, kwargs

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
   2665             arg_names=arg_names,
   2666             override_flat_arg_shapes=override_flat_arg_shapes,
-> 2667             capture_by_value=self._capture_by_value),
   2668         self._function_attributes,
   2669         # Tell the ConcreteFunction to clean up its graph once it goes out of

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
    979         _, original_func = tf_decorator.unwrap(python_func)
    980 
--> 981       func_outputs = python_func(*func_args, **func_kwargs)
    982 
    983       # invariant: `func_outputs` contains only Tensors, CompositeTensors,

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
    439         # __wrapped__ allows AutoGraph to swap in a converted function. We give
    440         # the function a weak reference to itself to avoid a reference cycle.
--> 441         return weak_wrapped_fn().__wrapped__(*args, **kwds)
    442     weak_wrapped_fn = weakref.ref(wrapped_fn)
    443 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
    966           except Exception as e:  # pylint:disable=broad-except
    967             if hasattr(e, "ag_error_metadata"):
--> 968               raise e.ag_error_metadata.to_exception(e)
    969             else:
    970               raise

NameError: in user code:

    /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    <ipython-input-4-56fa61b8449a>:14 train_step  *
        _minimize(self.distribute_strategy, tape, self.optimizer, loss,

    NameError: name '_minimize' is not defined

I know that

  gradients = tape.gradient(loss, trainable_variables)
  self.optimizer.apply_gradients(zip(gradients, trainable_variables))

is roughly equivalent to _minimize, but I can't use it in my case: I am training on a TPU, and that code raises an AttributeError for some reason (see my related question: "AttributeError: Tensor.name is meaningless when eager execution is enabled." when training on TPU at self.optimizer.apply_gradients).
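
Put together, the public-API version of my train_step would look like this (it works on GPU, but on TPU it hits the AttributeError mentioned above):

def train_step(self, data):
    x = data
    y = tf.constant([1.0], dtype=tf.float32)

    with tf.GradientTape() as tape:
        y_pred = self(x, training=True)  # Forward pass
        loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)

    # Public-API equivalent of _minimize: compute gradients and apply them directly
    gradients = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

    self.compiled_metrics.update_state(y, y_pred)
    return {m.name: m.result() for m in self.metrics}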

I tried a workaround where I also define _minimize inside the subclass itself when overriding the class:

# Private TF modules referenced by the copied _minimize body:
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer as lso

class CustomModel(tf.keras.Model):
    def __init__(self):
        super(CustomModel).__init__()
        
    def train_step(self, data):
        # Unpack the data. Its structure depends on your model and
        # on what you pass to `fit()`.
        x = data
        y = tf.constant([1.0], dtype=tf.float32)

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)  # Forward pass
            # Compute the loss value
            # (the loss function is configured in `compile()`)
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
        
        self._minimize(self.distribute_strategy, tape, self.optimizer, loss,
                self.trainable_variables)

        self.compiled_metrics.update_state(y, y_pred)
        return {m.name: m.result() for m in self.metrics}

    def _minimize(strategy, tape, optimizer, loss, trainable_variables):
        with tape:
            if isinstance(optimizer, lso.LossScaleOptimizer):
                loss = optimizer.get_scaled_loss(loss)

        gradients = tape.gradient(loss, trainable_variables)
        # ClipIfNotNone / ClipIfNotNone2 are my own gradient-clipping helpers,
        # defined elsewhere in my code.
        gradients = [ClipIfNotNone(grad) for grad in gradients]
        gradients = [ClipIfNotNone2(grad) for grad in gradients]
        # Whether to aggregate gradients outside of optimizer. This requires support
        # of the optimizer and doesn't work with ParameterServerStrategy and
        # CentralStorageStrategy.
        aggregate_grads_outside_optimizer = (
            optimizer._HAS_AGGREGATE_GRAD and  # pylint: disable=protected-access
            not isinstance(strategy.extended,
                            parameter_server_strategy.ParameterServerStrategyExtended))

        if aggregate_grads_outside_optimizer:
            # We aggregate gradients before unscaling them, in case a subclass of
            # LossScaleOptimizer all-reduces in fp16. All-reducing in fp16 can only be
            # done on scaled gradients, not unscaled gradients, for numeric stability.
            gradients = optimizer._aggregate_gradients(zip(gradients,  # pylint: disable=protected-access
                                                        trainable_variables))
        if isinstance(optimizer, lso.LossScaleOptimizer):
            gradients = optimizer.get_unscaled_gradients(gradients)
        gradients = optimizer._clip_gradients(gradients)  # pylint: disable=protected-access
        if trainable_variables:
            if aggregate_grads_outside_optimizer:
                optimizer.apply_gradients(
                    zip(gradients, trainable_variables),
                    experimental_aggregate_gradients=False)
            else:
                optimizer.apply_gradients(zip(gradients, trainable_variables))

But then I get

TypeError: tf___minimize() takes 5 positional arguments but 6 were given
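
Looking at the signature, I suspect the mismatch is that self._minimize(...) passes the instance plus five arguments (six in total), while my _minimize is declared without self and so only accepts five. A sketch of the signature that would match the bound call (the body stays as above):

    def _minimize(self, strategy, tape, optimizer, loss, trainable_variables):
        # Declaring self first means the bound call
        # self._minimize(strategy, tape, optimizer, loss, self.trainable_variables)
        # supplies exactly the six arguments this signature expects.
        ...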

Upvotes: 0

Views: 698

Answers (1)

dimay

Reputation: 2804

Try it:

class CustomModel(tf.keras.Model):
    def __init__(self):
        super(CustomModel, self).__init__()
    # your code
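
The difference from the code in the question is the second argument to super(): super(CustomModel).__init__() builds an unbound super object, so tf.keras.Model.__init__ never actually runs and none of the Keras machinery (layer tracking, compiled_loss, compiled_metrics) gets set up. In Python 3 the zero-argument form is equivalent and simpler:

class CustomModel(tf.keras.Model):
    def __init__(self):
        super().__init__()  # same as super(CustomModel, self).__init__() in Python 3
    # your code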

Upvotes: 0
