Reputation: 1
2024-07-31 17:34:52.770154: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-31 17:34:53.956581: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
C:\Users\myuser\PycharmProjects\.venv\Lib\site-packages\keras\src\layers\core\embedding.py:90: UserWarning: Argument `input_length` is deprecated. Just remove it.
warnings.warn(
2024-07-31 17:35:09.997360: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Epoch 1/5
Traceback (most recent call last):
File "C:\Users\myuser\PycharmProjects\.venv\Lib\site-packages\fasttext\align_sentences_ANN.py", line 112, in <module>
main()
File "C:\Users\myuser\PycharmProjects\.venv\Lib\site-packages\fasttext\align_sentences_ANN.py", line 99, in main
trained_model = train_lstm_model(lstm_model, sentences, embeddings, tokenizer)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\myuser\PycharmProjects\.venv\Lib\site-packages\fasttext\align_sentences_ANN.py", line 43, in train_lstm_model
model.fit(padded_sequences, np.array(embeddings), epochs=epochs)
File "C:\Users\myuser\PycharmProjects\.venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 122, in error_handler
raise e.with_traceback(filtered_tb) from None
File "C:\Users\myuser\PycharmProjects\.venv\Lib\site-packages\keras\src\losses\losses.py", line 1286, in mean_squared_error
return ops.mean(ops.square(y_true - y_pred), axis=-1)
~~~~~~~^~~~~~~~
ValueError: Dimensions must be equal, but are 300 and 100 for '{{node compile_loss/mean_squared_error/sub}} = Sub[T=DT_FLOAT](data_1, sequential_1/dense_1/Add)' with input shapes: [?,300], [?,100].
The code being executed is as follows:
import os
import nltk
import numpy as np
import fasttext.util
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from sklearn.metrics.pairwise import cosine_similarity
#nltk.download('punkt')
# Functions for processing text and generating embeddings
def segment_sentences(file_path):
"""Segment text into sentences using NLTK."""
with open(file_path, 'r', encoding='utf-8') as file:
text = file.read()
sentences = nltk.sent_tokenize(text)
return sentences
def get_sentence_embedding(sentence, model):
"""Generate sentence embeddings using a FastText model."""
words = sentence.split()
embeddings = [model.get_word_vector(word) for word in words]
return np.mean(embeddings, axis=0)
# Functions for neural network model creation and training
def create_lstm_model(vocab_size, embedding_dim=100, input_length=50):
"""Create and compile an LSTM neural network model."""
model = Sequential([
Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length),
LSTM(units=128, return_sequences=False),
Dense(units=embedding_dim)
])
model.compile(optimizer='adam', loss='mean_squared_error')
return model
def train_lstm_model(model, sentences, embeddings, tokenizer, epochs=5):
"""Train the LSTM model on the given sentences and embeddings."""
sequences = tokenizer.texts_to_sequences(sentences)
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=50)
model.fit(padded_sequences, np.array(embeddings), epochs=epochs)
return model
# Functions for sentence alignment and output
def align_sentences(embeddings1, embeddings2):
"""Align sentences based on cosine similarity of embeddings."""
similarity_matrix = cosine_similarity(embeddings1, embeddings2)
alignment = []
for i in range(len(embeddings1)):
j = similarity_matrix[i].argmax()
alignment.append((i, j))
return alignment
def combine_alignments(alignment12, alignment13, alignment14):
"""Combine pairwise alignments into a single alignment."""
combined_alignment = []
for i, j in alignment12:
k = next(k for k, l in alignment13 if k == i)
l = next(l for m, l in alignment14 if m == i)
combined_alignment.append((i, j, k, l))
return combined_alignment
def output_aligned_sentences(alignment, sentences1, sentences2, sentences3, sentences4):
"""Print aligned sentences."""
for i, j, k, l in alignment:
print(f"text1: {sentences1[i]}")
print(f"text2: {sentences2[j]}")
print(f"text3: {sentences3[k]}")# this is a pre-processed text in Portuguese.
print(f"text4: {sentences4[l]}")
print('-' * 80)
# Main function to execute the alignment process
def main():
directory = 'C:/Users/myuser/PycharmProjects/.venv/Lib/site-packages/fasttext'
files = ['text1.txt', 'text2.txt', 'text3.txt', 'text4.txt']
file_paths = [os.path.join(directory, file) for file in files]
sentences_list = [segment_sentences(file_path) for file_path in file_paths]
# Load pre-trained FastText model for embedding generation
model = fasttext.load_model('cc.pt.300.bin')
embeddings_list = [[get_sentence_embedding(sentence, model) for sentence in sentences] for sentences in sentences_list]
# Prepare tokenizer for each text
tokenizers = [tf.keras.preprocessing.text.Tokenizer() for _ in sentences_list]
for tokenizer, sentences in zip(tokenizers, sentences_list):
tokenizer.fit_on_texts(sentences)
# Create and train an LSTM model for each text
lstm_models = []
for tokenizer, sentences, embeddings, file_name in zip(tokenizers, sentences_list, embeddings_list, files):
vocab_size = len(tokenizer.word_index) + 1
lstm_model = create_lstm_model(vocab_size)
trained_model = train_lstm_model(lstm_model, sentences, embeddings, tokenizer)
lstm_models.append(trained_model)
trained_model.save(os.path.join(directory, f'lstm_model_{file_name.split(".")[0]}.h5'))
alignment12 = align_sentences(embeddings_list[0], embeddings_list[1])
alignment13 = align_sentences(embeddings_list[0], embeddings_list[2])
alignment14 = align_sentences(embeddings_list[0], embeddings_list[3])
final_alignment = combine_alignments(alignment12, alignment13, alignment14)
output_aligned_sentences(final_alignment, *sentences_list)
if __name__ == "__main__":
main()
Can anyone help me overcome this error?
Upvotes: 0
Views: 56