Reputation: 59
I am trying to train a model for binary classification: sentiment analysis on tweets. The model raises an error after epoch 1 starts. It must be the size of the input, but I can't figure out exactly which input is causing the problem. Any help is greatly appreciated.
Many thanks!
I have already tried many different input sizes and the problem persists.
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
df = pd.read_csv('twitter-sentiment-analysis2/train.csv',encoding='latin-1')
df.drop(['ItemID'], axis=1, inplace=True)
label=list(df.Sentiment)
text=list(df.SentimentText)
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=' ')
tokenizer.fit_on_texts(text)
vocab = tokenizer.word_index
X_train, X_test, y_train, y_test = train_test_split(text, label, test_size=0.1,random_state=42)
X_train_word_ids = tokenizer.texts_to_sequences(X_train)
X_test_word_ids = tokenizer.texts_to_sequences(X_test)
x_train = pad_sequences(X_train_word_ids, maxlen=50)
x_test = pad_sequences(X_test_word_ids, maxlen=50)
glove_dir = 'glove6b100dtxt/'
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))
embedding_dim = 100  # dimensionality of my GloVe vectors (glove.6B.100d)
max_words=50
maxlen=50
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in vocab.items():
    embedding_vector = embeddings_index.get(word)
    if i < max_words:
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
history = model.fit(x_train, y_train,epochs=10,batch_size=32,validation_split=0.1,shuffle=True)
model.save_weights('pre_trained_glove_model.h5')
Could anybody give me some advice on where to look? Thanks again!
This is the error:
File "HM3.py", line 58, in <module>
history = model.fit(x_train, y_train,epochs=10,batch_size=32,validation_split=0.1,shuffle=True)
File "/usr/local/lib/python3.6/dist-packages/keras/engine/training.py", line 1039, in fit
validation_steps=validation_steps)
File "/usr/local/lib/python3.6/dist-packages/keras/engine/training_arrays.py", line 199, in fit_loop
outs = f(ins_batch)
File "/usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py", line 2715, in __call__
return self._call(inputs)
File "/usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py", line 2675, in _call
fetched = self._callable_fn(*array_vals)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1439, in __call__
run_metadata_ptr)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/errors_impl.py", line 528, in __exit__
c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: indices[26,39] = 31202 is not in [0, 50)
[[{{node embedding_1/embedding_lookup}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_1/embeddings/read, embedding_1/Cast, embedding_1/embedding_lookup/axis)]]
Upvotes: 0
Views: 57
Reputation: 14473
max_words=50
...
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
You create an Embedding layer that can only embed 50 different words, yet your training data indexes every word that occurs. The error tells you that the word with index 31202 falls outside the valid index range [0, 50).
One solution is to enlarge the Embedding input dimension so it covers all words occurring in the training set, as sketched below. Another is to reserve index 0 with an all-zero embedding and remap every training word whose index is >= 50 to that zero index.
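As a minimal sketch of the first option (it reuses the vocab, embeddings_index, embedding_dim, and maxlen variables from your code; num_words is a name I am introducing here):

num_words = len(vocab) + 1  # +1 because Keras word indices start at 1; row 0 is the padding index

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in vocab.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words without a GloVe vector keep an all-zero row.
        embedding_matrix[i] = embedding_vector

model.add(Embedding(num_words, embedding_dim, input_length=maxlen))

With this, every index produced by texts_to_sequences is a valid row of the embedding matrix, so the embedding lookup can no longer go out of range.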
Upvotes: 1