Reputation: 11
I'm trying to implement a VQA model in which I'm combining an image model and a language model. My model definition is:
def VQA_MODEL():
    image_feature_size = 4096
    word_feature_size = 300
    number_of_LSTM = 3
    number_of_hidden_units_LSTM = 512
    max_length_questions = 30
    number_of_dense_layers = 3
    number_of_hidden_units = 1024
    activation_function = 'tanh'
    dropout_pct = 0.5

    # Image model
    model_image = Sequential()
    model_image.add(Reshape((image_feature_size,), input_shape=(image_feature_size,)))

    # Language model
    model_language = Sequential()
    model_language.add(LSTM(number_of_hidden_units_LSTM, return_sequences=True, input_shape=(max_length_questions, word_feature_size)))
    model_language.add(LSTM(number_of_hidden_units_LSTM, return_sequences=True))
    model_language.add(LSTM(number_of_hidden_units_LSTM, return_sequences=False))

    # Combined model
    model = Sequential()
    model.add(concatenate([model_language.output, model_image.output]))

    for _ in range(number_of_dense_layers):
        model.add(Dense(number_of_hidden_units, kernel_initializer='uniform', activation=activation_function))
        model.add(Dropout(dropout_pct))

    model.add(Dense(50, activation='softmax'))
    return model
model = VQA_MODEL()
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.fit(train_X, train_Y, batch_size = batch_size, epochs=nb_epoch)
I can't seem to understand how to resolve the error. Any leads will be appreciated.
Upvotes: 0
Views: 164
Reputation: 1377
The error occurs because concatenate (with a lowercase c) is not a layer; only Concatenate (with a capital C) is a layer. However, that will also not work in your case.
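A minimal sketch of the difference, using two placeholder Input tensors a and b (names chosen only for illustration):

from tensorflow.keras.layers import Input, concatenate, Concatenate

a = Input(shape=(8,))
b = Input(shape=(8,))

# concatenate (lowercase) is a function: it returns a tensor, not a layer
x1 = concatenate([a, b])

# Concatenate (capital) is a layer class: instantiate it, then call it on a list of tensors
x2 = Concatenate()([a, b])

Both forms are valid in the Functional API, but neither helps inside Sequential.add(): passing concatenate(...) fails because it produces a tensor rather than a layer, and a Concatenate layer cannot receive two parallel branches inside a single Sequential model.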
Since your combined model is not sequential and takes inputs from two parallel models, it's better to use the Functional API. The following code should work:
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import LSTM, Reshape, Dense, Dropout, concatenate, Concatenate
def VQA_MODEL():
    image_feature_size = 4096
    word_feature_size = 300
    number_of_LSTM = 3
    number_of_hidden_units_LSTM = 512
    max_length_questions = 30
    number_of_dense_layers = 3
    number_of_hidden_units = 1024
    activation_function = 'tanh'
    dropout_pct = 0.5

    # Image model
    model_image = Sequential()
    model_image.add(Reshape((image_feature_size,), input_shape=(image_feature_size,)))

    # Language model
    model_language = Sequential()
    model_language.add(LSTM(number_of_hidden_units_LSTM, return_sequences=True, input_shape=(max_length_questions, word_feature_size)))
    model_language.add(LSTM(number_of_hidden_units_LSTM, return_sequences=True))
    model_language.add(LSTM(number_of_hidden_units_LSTM, return_sequences=False))

    # Combined model (Functional API)
    x = Concatenate()([model_language.output, model_image.output])
    for _ in range(number_of_dense_layers):
        x = Dense(number_of_hidden_units, kernel_initializer='uniform', activation=activation_function)(x)
        x = Dropout(dropout_pct)(x)
    x = Dense(50, activation='softmax')(x)

    model = Model(inputs=[model_language.input, model_image.input], outputs=x)
    return model
model = VQA_MODEL()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_X, train_Y, batch_size = batch_size, epochs=nb_epoch)
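Note that the returned model now has two inputs, so train_X must be passed as a list in the order [question_features, image_features], matching inputs=[model_language.input, model_image.input]. A minimal sketch with random placeholder data (array names and sizes are only illustrative):

import numpy as np

num_samples = 8
questions = np.random.rand(num_samples, 30, 300)  # (batch, max_length_questions, word_feature_size)
images = np.random.rand(num_samples, 4096)        # (batch, image_feature_size)
answers = np.random.rand(num_samples, 50)         # one-hot answer vectors in practice

model.fit([questions, images], answers, batch_size=4, epochs=1)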
Upvotes: 1