Reputation: 3
I am trying to run the following main.py file and I continuously get the error "InvalidArgumentError (see above for traceback): indices[138,4] = 23 is not in [0, 23)". I have checked my vocab file. It has exactly 23 words in it. The code works fine for a single line of new data inserted but when the data is continuous or more then this error pops out. Please help me to rectify this issue. Below is a small snippet of my code . The line "word_embeddings = tf.nn.embedding_lookup(variable, word_ids)" is where the error comes.
def model_fn(features, labels, mode, params):
# For serving features are a bit different
if isinstance(features, dict):
features = ((features['words'], features['nwords']),
(features['chars'], features['nchars']))
# Read vocabs and inputs
(words, nwords), (chars, nchars) = features
dropout = params['dropout']
training = (mode == tf.estimator.ModeKeys.TRAIN)
vocab_words = tf.contrib.lookup.index_table_from_file(
params['words'], num_oov_buckets=params['num_oov_buckets'])
vocab_chars = tf.contrib.lookup.index_table_from_file(
params['chars'], num_oov_buckets=params['num_oov_buckets'])
with Path(params['tags']).open() as f:
indices = [idx for idx, tag in enumerate(f) if tag.strip() != 'O']
num_tags = len(indices) + 1
with Path(params['chars']).open() as f:
num_chars = sum(1 for _ in f) + params['num_oov_buckets']
# Char Embeddings
char_ids = vocab_chars.lookup(chars)
variable = tf.get_variable(
'chars_embeddings', [num_chars, params['dim_chars']], tf.float32)
char_embeddings = tf.nn.embedding_lookup(variable, char_ids)
char_embeddings = tf.layers.dropout(char_embeddings, rate=dropout,
training=training)
# Char LSTM
dim_words = tf.shape(char_embeddings)[1]
dim_chars = tf.shape(char_embeddings)[2]
flat = tf.reshape(char_embeddings, [-1, dim_chars, params['dim_chars']])
t = tf.transpose(flat, perm=[1, 0, 2])
lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['char_lstm_size'])
lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['char_lstm_size'])
lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
_, (_, output_fw) = lstm_cell_fw(t, dtype=tf.float32,
sequence_length=tf.reshape(nchars, [-1]))
_, (_, output_bw) = lstm_cell_bw(t, dtype=tf.float32,
sequence_length=tf.reshape(nchars, [-1]))
output = tf.concat([output_fw, output_bw], axis=-1)
char_embeddings = tf.reshape(output, [-1, dim_words, 50])
# Word Embeddings
word_ids = vocab_words.lookup(words)
glove = np.load(params['glove'])['embeddings'] # np.array
variable = np.vstack([glove, [[0.] * params['dim']]])
variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
word_embeddings = tf.nn.embedding_lookup(variable, word_ids)
# Concatenate Word and Char Embeddings
embeddings = tf.concat([word_embeddings, char_embeddings], axis=-1)
embeddings = tf.layers.dropout(embeddings, rate=dropout, training=training)
# LSTM
t = tf.transpose(embeddings, perm=[1, 0, 2]) # Need time-major
lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=nwords)
output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=nwords)
output = tf.concat([output_fw, output_bw], axis=-1)
output = tf.transpose(output, perm=[1, 0, 2])
output = tf.layers.dropout(output, rate=dropout, training=training)
# CRF
logits = tf.layers.dense(output, num_tags)
crf_params = tf.get_variable("crf", [num_tags, num_tags], dtype=tf.float32)
pred_ids, _ = tf.contrib.crf.crf_decode(logits, crf_params, nwords)
if mode == tf.estimator.ModeKeys.PREDICT:
# Predictions
reverse_vocab_tags = tf.contrib.lookup.index_to_string_table_from_file(
params['tags'])
pred_strings = reverse_vocab_tags.lookup(tf.to_int64(pred_ids))
predictions = {
'pred_ids': pred_ids,
'tags': pred_strings
}
return tf.estimator.EstimatorSpec(mode, predictions=predictions)
else:
# Loss
vocab_tags = tf.contrib.lookup.index_table_from_file(params['tags'])
tags = vocab_tags.lookup(labels)
log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
logits, tags, nwords, crf_params)
loss = tf.reduce_mean(-log_likelihood)
# Metrics
weights = tf.sequence_mask(nwords)
metrics = {
'acc': tf.metrics.accuracy(tags, pred_ids, weights),
'precision': precision(tags, pred_ids, num_tags, indices, weights),
'recall': recall(tags, pred_ids, num_tags, indices, weights),
'f1': f1(tags, pred_ids, num_tags, indices, weights),
}
for metric_name, op in metrics.items():
tf.summary.scalar(metric_name, op[1])
if mode == tf.estimator.ModeKeys.EVAL:
return tf.estimator.EstimatorSpec(
mode, loss=loss, eval_metric_ops=metrics)
elif mode == tf.estimator.ModeKeys.TRAIN:
train_op = tf.train.AdamOptimizer().minimize(
loss, global_step=tf.train.get_or_create_global_step())
return tf.estimator.EstimatorSpec(
mode, loss=loss, train_op=train_op)
if __name__ == '__main__':
# Params
params = {
'dim': 300,
'dim_chars': 100,
'dropout': 0.5,
'num_oov_buckets': 1,
'epochs': 25,
'batch_size': 20,
'buffer': 30000000,
'char_lstm_size': 25,
'lstm_size': 100,
'words': str(Path(DATADIR, 'vocab.words.txt')),
'chars': str(Path(DATADIR, 'vocab.chars.txt')),
'tags': str(Path(DATADIR, 'vocab.tags.txt')),
'glove': str(Path(DATADIR, 'glove.npz'))
}
with Path('results1/params.json').open('w') as f:
json.dump(params, f, indent=4, sort_keys=True)
# Word Embeddings
word_ids = vocab_words.lookup(words)
glove = np.load(params['glove'])['embeddings'] # np.array
variable = np.vstack([glove, [[0.] * params['dim']]])
variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
word_embeddings = tf.nn.embedding_lookup(variable, word_ids)
Upvotes: 0
Views: 488
Reputation: 11
Hope this is not too late for you. I have been googling this issue for a while, hopefully got the root of it and turns out it was quite simple. Similar issues unsolved were here and here.
Chances are: You have seen an example of this embeddings code somewhere and tried to follow it (this was the case for me). However, the case is that coders and tensorflow assume that the id's for the inputs are sequential. I.e. that if you have 1000 items for example, then your id's are [0,1,2,3..998,999]. However, this is usually not the case with real data where id's are something like "xYzVryCmplxNm5m3r" (in this case, it will give and error because there are characters in the id and tensorflow will not accept that, it only accepts integers), or, in the very subtle case that is probably your case, the id's are actually integers but not sequential. For example, they can go like : ids=[68632548, 15323, ....]. In this case, tensorflow will accept the input data (because it's integers as expected) and give you this error, because the numbers are not sequential and actually much larger than the number of unique id's (this number+1 is usually set to be the limit for the vocab size).
The solution that worked for me was to map all the id values in the original dataframe to sequential id's, preserving their uniqueness, and then input the same data again (it actually worked !).
The code could be something like:
unique_ids=np.unique(old_ids)
sqeuential_ids=[i for i in range(len(unique_ids))]
id_mapping_dict=dict(zip(unique_ids,sqeuential_ids))
def map_ids_to_sequential(original_id):
return id_mapping_dict[original_id]
df['ids']=df['ids'].apply(map_ids_to_sequential)
Upvotes: 1