Moshel
Moshel

Reputation: 420

Simple model can't run on tpu (on colab)

I have problems running a very simple model on a TPU in Google Colab. I have distilled it to a very simple program. I suspect it doesn't like the nested model (input_2?), but I have no idea how to solve this:

import numpy as np
import os

import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Activation, Dense, Multiply, Input
from tensorflow.keras import metrics

import warnings
warnings.filterwarnings("ignore")


class DataGenerator:
    """Endless source of synthetic (image batch, multi-hot label batch) pairs.

    The images are placeholders filled with ones; only the labels depend on
    the contents of ``dataset_info``.
    """

    def __init__(self):
        pass

    def create_train(self, dataset_info, batch_size, shape, augument=True):
        """Yield ``(batch_images, batch_labels)`` tuples forever.

        Args:
            dataset_info: Indexable collection of dicts, each with a
                ``'path'`` entry and a ``'labels'`` entry holding the label
                indices (0..27) to set to 1 for that sample.
            batch_size: Number of samples drawn (with replacement) per batch.
            shape: ``(height, width, channels)`` of one image; ``channels``
                must be 3.
            augument: Unused; kept so existing callers keep working.

        Yields:
            A float array of shape ``(batch_size, *shape)`` and a label array
            of shape ``(batch_size, 28)``.
        """
        assert shape[2] == 3
        while True:
            random_indexes = np.random.choice(len(dataset_info), batch_size)
            batch_images1 = np.empty((batch_size, shape[0], shape[1], shape[2]))
            batch_labels = np.zeros((batch_size, 28))
            for i, idx in enumerate(random_indexes):
                record = dataset_info[idx]
                batch_images1[i] = self.load_image(record['path'], shape)
                # Multi-hot encoding: every listed label index becomes 1.
                batch_labels[i][record['labels']] = 1
            yield batch_images1, batch_labels

    def load_image(self, path, shape):
        """Return a placeholder image of ones with the requested ``shape``.

        ``path`` is accepted for interface compatibility but not read; no
        file is opened. The image follows ``shape`` so it always fits the
        batch buffer allocated by ``create_train``.
        """
        # np.float was removed in NumPy 1.24; it was an alias for the builtin
        # float, i.e. float64.
        return np.ones((shape[0], shape[1], shape[2]), dtype=np.float64)

# One generator factory, used further down for both the training and the
# validation batch streams.
train_datagen = DataGenerator()

# Synthetic metadata: 1000 training records (ids 0-999), each carrying label index 5.
train_dataset_info = np.array([
    {'path': str(sample_id), 'labels': np.array([5])}
    for sample_id in range(0, 1000)
])

# 200 validation records (ids 1000-1199), each carrying label index 6.
valid_dataset_info = np.array([
    {'path': str(sample_id), 'labels': np.array([6])}
    for sample_id in range(1000, 1200)
])
print(train_dataset_info.shape, valid_dataset_info.shape)

def create_model(input_shape, n_out):
    """Build a ResNet50 backbone followed by a sigmoid classification head.

    Args:
        input_shape: Shape of one input image, e.g. ``(256, 256, 3)``.
        n_out: Number of independent sigmoid outputs (labels).

    Returns:
        An uncompiled Keras ``Model`` mapping an image batch to ``n_out``
        per-label probabilities.
    """
    inp_mask = Input(shape=input_shape)
    # Build the backbone for the requested shape rather than a hard-coded
    # (256, 256, 3), so it always agrees with the Input layer above.
    # NOTE(review): ResNet50 is used here as a nested model and therefore
    # owns a second input layer of its own -- presumably the "input_2"
    # placeholder reported in the TPU log; confirm before relying on this.
    pretrain_model_mask = ResNet50(
        input_shape=input_shape,
        include_top=False,
        weights=None,  # randomly initialised, no pretrained weights loaded
        pooling='max')

    x = pretrain_model_mask(inp_mask)
    out = Dense(n_out, activation='sigmoid')(x)
    model = Model(inputs=inp_mask, outputs=[out])

    return model


# Clear the Keras backend session before building the model.
tf.keras.backend.clear_session()

model = create_model(input_shape=(256, 256, 3), n_out=28)

model.compile(
    optimizer=tf.train.AdamOptimizer(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['acc'],
)

# The TPU endpoint is read from the COLAB_TPU_ADDR environment variable.
TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']

# Wrap the compiled Keras model for TPU execution, one step at a time.
resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)
strategy = tf.contrib.tpu.TPUDistributionStrategy(resolver)
tpu_model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=strategy)

tpu_model.summary()


epochs = 4
batch_size = 64

# Two independent endless batch streams: one over the training records,
# one over the validation records.
image_shape = (256, 256, 3)
train_generator = train_datagen.create_train(
    train_dataset_info, batch_size, image_shape)
validation_generator = train_datagen.create_train(
    valid_dataset_info, batch_size, image_shape)

# Train the TPU model from the generators.
history = tpu_model.fit_generator(
    train_generator,
    steps_per_epoch=1000,
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps=20,
    verbose=1,
)

This is the output of running it (just paste it into Colab as a single cell):

Epoch 1/4
INFO:tensorflow:New input shapes; (re-)compiling: mode=train (# of cores 8), [TensorSpec(shape=(8,), dtype=tf.int32, name='core_id0'), TensorSpec(shape=(8, 512, 512, 3), dtype=tf.float32, name='input_1_10'), TensorSpec(shape=(8, 28), dtype=tf.float32, name='dense_target_30')]
INFO:tensorflow:Overriding default placeholder.
INFO:tensorflow:Remapping placeholder for input_1
INFO:tensorflow:Remapping placeholder for input_2
INFO:tensorflow:Default: input_2
ERROR:tensorflow:Operation of type Placeholder (tpu_140454984405456_1/input_2) is not supported on the TPU. Execution will fail if this op is used in the graph. 
INFO:tensorflow:Started compiling
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-36-112706d24f9b> in <module>()
     61     validation_steps=len(valid_df)//batch_size,
     62     epochs=4,
---> 63     verbose=1,
     64 #    use_multiprocessing=False,
     65 #    callbacks=[checkpointer]

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
   2175         use_multiprocessing=use_multiprocessing,
   2176         shuffle=shuffle,
-> 2177         initial_epoch=initial_epoch)
   2178 
   2179   def evaluate_generator(self,

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
    174 
    175         outs = model.train_on_batch(
--> 176             x, y, sample_weight=sample_weight, class_weight=class_weight)
    177 
    178         if not isinstance(outs, list):

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in train_on_batch(self, x, y, sample_weight, class_weight)
   1938 
   1939       self._make_train_function()
-> 1940       outputs = self.train_function(ins)
   1941 
   1942     if len(outputs) == 1:

/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/tpu/python/tpu/keras_support.py in __call__(***failed resolving arguments***)
   1247     input_specs = infeed_instance.make_input_specs(input_tensors)
   1248     tpu_model_ops = self._tpu_model_ops_for_input_specs(input_specs,
-> 1249                                                         infeed_manager)
   1250     infeed_dict = infeed_instance.make_feed_dict(tpu_model_ops)
   1251 

/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/tpu/python/tpu/keras_support.py in _tpu_model_ops_for_input_specs(self, input_specs, infeed_manager)
   1154                                                  infeed_manager)
   1155       self._compilation_cache[shape_key] = new_tpu_model_ops
-> 1156       self._test_model_compiles(new_tpu_model_ops)
   1157 
   1158     return self._compilation_cache[shape_key]

/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/tpu/python/tpu/keras_support.py in _test_model_compiles(self, tpu_model_ops)
   1097     if proto.status_error_message:
   1098       raise RuntimeError('Compilation failed: {}'.format(
-> 1099           proto.status_error_message))
   1100 
   1101     end_time = time.time()

RuntimeError: Compilation failed: Compilation failure: Detected unsupported operations when trying to compile graph cluster_1_11838307395637379894[] on XLA_TPU_JIT: Placeholder (No registered 'Placeholder' OpKernel for XLA_TPU_JIT devices compatible with node {{node tpu_140454984405456_1/input_2}} = Placeholder[dtype=DT_FLOAT, shape=[?,512,512,3], _device="/device:TPU_REPLICATED_CORE"]()
    .  Registered:  device='TPU'
  device='CPU'
  device='GPU'
  device='XLA_GPU'
  device='XLA_CPU'
){{node tpu_140454984405456_1/input_2}}

For some reason Stack Overflow insists that I write some more details... there are none.

Upvotes: 2

Views: 2893

Answers (2)

SantoshGupta7
SantoshGupta7

Reputation: 6197

I believe `fit_generator` is not supported on TPUs; see https://github.com/tensorflow/tensorflow/issues/30162

Upvotes: 0

codebreach
codebreach

Reputation: 2220

Some operations are not supported on TPU. You can use tensorboard to check which part of the graph is not compatible. Then you can pin those operations to the CPU and it should work.

In your code it seems input_x is not TPU compatible. TPUs require constant shape and batch sizes.

Upvotes: 1

Related Questions