Reputation: 515
Attempting to get LinearClassifier running with Colab TPU. https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/estimator/LinearClassifier
TPUStrategy is supported in TensorFlow 2.0 Estimator(LinearClassifier) https://www.tensorflow.org/beta/guide/distribute_strategy#whats_supported_now_2
LinearClassifier works as expected without the tpu_strategy. https://www.tensorflow.org/beta/guide/distribute_strategy#tpustrategy
When I pass a RunConfig that uses tpu_strategy as the config for LinearClassifier, I get the following error:
InvalidArgumentError: No OpKernel was registered to support Op 'TPUReplicatedInput' used by {{node input0}}with these attrs: [T=DT_DOUBLE, N=8] Registered devices: [CPU, XLA_CPU] Registered kernels: [[input0]] https://www.tensorflow.org/beta/guide/distribute_strategy#using_tfdistributestrategy_with_estimator
I have been fighting this for a few days now. What is wrong here?
!pip install tensorflow==2.0.0-beta0
import tensorflow.feature_column as fc
import tensorflow as tf
import os
# Report the TensorFlow version that is actually imported in this runtime.
print(tf.__version__)
# gRPC address of the Colab TPU worker, built from the COLAB_TPU_ADDR
# environment variable (raises KeyError if the variable is not set).
TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
# Resolver pointing at that TPU worker address.
cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_WORKER)
# Connect this runtime to the resolver's master address.
tf.config.experimental_connect_to_host(cluster_resolver.master())
# Initialize the TPU system before the strategy is created.
# NOTE(review): presumably re-running this cell re-initializes the TPU — confirm.
tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
# The same TPU strategy is used for both training and evaluation.
config = tf.estimator.RunConfig(train_distribute=tpu_strategy, eval_distribute=tpu_strategy)
# Module-level batch size; read by the dataset .batch() call in input_fn.
batch_size = 1
def make_input_fn(X, y):
    """Return a zero-argument input function for ``Estimator.train``.

    Args:
        X: Feature data; it is converted with ``dict(X)`` when the returned
            function is called, so it must be acceptable to ``dict()``.
        y: Labels, sliced together with the features.

    Returns:
        A callable taking no arguments that builds a ``tf.data.Dataset`` of
        ``(features, labels)`` slices, batched by the module-level
        ``batch_size``.
    """

    def input_fn():
        # The dataset is built lazily, each time the input function is called,
        # not when the factory is called.
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y)).batch(batch_size)
        return dataset

    return input_fn
# Build the training input function from the feature dictionary and labels
# (both defined outside this snippet).
input_fn = make_input_fn(estimator_train_attributes_dictionary,labels_train)
# Linear classifier configured with the RunConfig that carries the TPU strategy.
linear_est = tf.estimator.LinearClassifier(feature_columns=attibute_columns,config=config)
# This is the call that raises the InvalidArgumentError shown in the traceback.
linear_est.train(input_fn=input_fn)
Full exception in Colab:
W0618 18:08:10.280844 140506166175616 estimator.py:1811] Using temporary folder as model directory: /tmp/tmp2xc1fixj
2.0.0-beta0
W0618 18:09:00.986362 140506166175616 tpu.py:218] 3 unsupported operations found:
ScalarSummary (bias)
ScalarSummary (fraction_of_zero_weights)
ScalarSummary (loss)
W0618 18:09:43.578035 140506166175616 tpu_strategy_util.py:57] TPU system %s has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1355 try:
-> 1356 return fn(*args)
1357 except errors.OpError as e:
20 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1338 # Ensure any changes to the graph are reflected in the runtime.
-> 1339 self._extend_graph()
1340 return self._call_tf_sessionrun(
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _extend_graph(self)
1373 with self._graph._session_run_lock(): # pylint: disable=protected-access
-> 1374 tf_session.ExtendSession(self._session)
1375
InvalidArgumentError: No OpKernel was registered to support Op 'TPUReplicatedInput' used by {{node input0}}with these attrs: [T=DT_DOUBLE, N=8]
Registered devices: [CPU, XLA_CPU]
Registered kernels:
<no registered kernels>
[[input0]]
During handling of the above exception, another exception occurred:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-23-66caf93d8677> in <module>()
25
26 linear_est = tf.estimator.LinearClassifier(feature_columns=attibute_columns,config=config)#feature_columns=featureNames,,config=config
---> 27 linear_est.train(input_fn=input_fn)#,max_steps=100
28
29 #train_spec = tf.estimator.TrainSpec(input_fn=input_fn, max_steps=1000)
/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
365
366 saving_listeners = _check_listeners_type(saving_listeners)
--> 367 loss = self._train_model(input_fn, hooks, saving_listeners)
368 logging.info('Loss for final step: %s.', loss)
369 return self
/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
1154 def _train_model(self, input_fn, hooks, saving_listeners):
1155 if self._train_distribution:
-> 1156 return self._train_model_distributed(input_fn, hooks, saving_listeners)
1157 else:
1158 return self._train_model_default(input_fn, hooks, saving_listeners)
/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_model_distributed(self, input_fn, hooks, saving_listeners)
1217 self._config._train_distribute.configure(self._config.session_config)
1218 return self._actual_train_model_distributed(
-> 1219 self._config._train_distribute, input_fn, hooks, saving_listeners)
1220 # pylint: enable=protected-access
1221
/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _actual_train_model_distributed(self, strategy, input_fn, hooks, saving_listeners)
1327 return self._train_with_estimator_spec(estimator_spec, worker_hooks,
1328 hooks, global_step_tensor,
-> 1329 saving_listeners)
1330
1331 def _train_with_estimator_spec_distributed(self, estimator_spec, worker_hooks,
/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_with_estimator_spec(self, estimator_spec, worker_hooks, hooks, global_step_tensor, saving_listeners)
1478 save_summaries_steps=save_summary_steps,
1479 config=self._session_config,
-> 1480 log_step_count_steps=log_step_count_steps) as mon_sess:
1481 loss = None
1482 any_step_done = False
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in MonitoredTrainingSession(master, is_chief, checkpoint_dir, scaffold, hooks, chief_only_hooks, save_checkpoint_secs, save_summaries_steps, save_summaries_secs, config, stop_grace_period_secs, log_step_count_steps, max_wait_secs, save_checkpoint_steps, summary_dir)
582 session_creator=session_creator,
583 hooks=all_hooks,
--> 584 stop_grace_period_secs=stop_grace_period_secs)
585
586
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in __init__(self, session_creator, hooks, stop_grace_period_secs)
1005 hooks,
1006 should_recover=True,
-> 1007 stop_grace_period_secs=stop_grace_period_secs)
1008
1009
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in __init__(self, session_creator, hooks, should_recover, stop_grace_period_secs)
723 stop_grace_period_secs=stop_grace_period_secs)
724 if should_recover:
--> 725 self._sess = _RecoverableSession(self._coordinated_creator)
726 else:
727 self._sess = self._coordinated_creator.create_session()
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in __init__(self, sess_creator)
1198 """
1199 self._sess_creator = sess_creator
-> 1200 _WrappedSession.__init__(self, self._create_session())
1201
1202 def _create_session(self):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in _create_session(self)
1203 while True:
1204 try:
-> 1205 return self._sess_creator.create_session()
1206 except _PREEMPTION_ERRORS as e:
1207 logging.info(
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in create_session(self)
869 """Creates a coordinated session."""
870 # Keep the tf_sess for unit testing.
--> 871 self.tf_sess = self._session_creator.create_session()
872 # We don't want coordinator to suppress any exception.
873 self.coord = coordinator.Coordinator(clean_stop_exception_types=[])
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in create_session(self)
645 init_op=self._scaffold.init_op,
646 init_feed_dict=self._scaffold.init_feed_dict,
--> 647 init_fn=self._scaffold.init_fn)
648
649
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/session_manager.py in prepare_session(self, master, init_op, saver, checkpoint_dir, checkpoint_filename_with_path, wait_for_checkpoint, max_wait_secs, config, init_feed_dict, init_fn)
294 "init_fn or local_init_op was given")
295 if init_op is not None:
--> 296 sess.run(init_op, feed_dict=init_feed_dict)
297 if init_fn:
298 init_fn(sess)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
948 try:
949 result = self._run(None, fetches, feed_dict, options_ptr,
--> 950 run_metadata_ptr)
951 if run_metadata:
952 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1171 if final_fetches or final_targets or (handle and feed_dict_tensor):
1172 results = self._do_run(handle, final_targets, final_fetches,
-> 1173 feed_dict_tensor, options, run_metadata)
1174 else:
1175 results = []
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1348 if handle is None:
1349 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1350 run_metadata)
1351 else:
1352 return self._do_call(_prun_fn, handle, feeds, fetches)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1368 pass
1369 message = error_interpolation.interpolate(message, self._graph)
-> 1370 raise type(e)(node_def, op, message)
1371
1372 def _extend_graph(self):
InvalidArgumentError: No OpKernel was registered to support Op 'TPUReplicatedInput' used by node input0 (defined at <ipython-input-23-66caf93d8677>:27) with these attrs: [T=DT_DOUBLE, N=8]
Registered devices: [CPU, XLA_CPU]
Registered kernels:
<no registered kernels>
[[input0]]
Upvotes: 2
Views: 417
Reputation: 322
Installing TF 2.0 in Colab (with the !pip command) is supported; however, the TPUs themselves do not get the installed TF version.
If you want to work with TPUs in Colab, use the TF version that comes by default.
Upvotes: 1