kim85
kim85

Reputation: 1

When I run the file, it starts processing and then stops, giving the following error

  1. I'm new to deep learning and I need help. The more detailed it is, the better. I have the following problem: In an attempt to train a model with three different languages, with Portuguese (as the fourth language) as the reference language, I'm getting the following error:
2024-07-31 17:34:52.770154: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-31 17:34:53.956581: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
C:\Users\myuser\PycharmProjects\.venv\Lib\site-packages\keras\src\layers\core\embedding.py:90: UserWarning: Argument `input_length` is deprecated. Just remove it.
  warnings.warn(
2024-07-31 17:35:09.997360: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Epoch 1/5
Traceback (most recent call last):
  File "C:\Users\myuser\PycharmProjects\.venv\Lib\site-packages\fasttext\align_sentences_ANN.py", line 112, in <module>
    main()
  File "C:\Users\myuser\PycharmProjects\.venv\Lib\site-packages\fasttext\align_sentences_ANN.py", line 99, in main
    trained_model = train_lstm_model(lstm_model, sentences, embeddings, tokenizer)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuser\PycharmProjects\.venv\Lib\site-packages\fasttext\align_sentences_ANN.py", line 43, in train_lstm_model
    model.fit(padded_sequences, np.array(embeddings), epochs=epochs)
  File "C:\Users\myuser\PycharmProjects\.venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 122, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:\Users\myuser\PycharmProjects\.venv\Lib\site-packages\keras\src\losses\losses.py", line 1286, in mean_squared_error
    return ops.mean(ops.square(y_true - y_pred), axis=-1)
                               ~~~~~~~^~~~~~~~
ValueError: Dimensions must be equal, but are 300 and 100 for '{{node compile_loss/mean_squared_error/sub}} = Sub[T=DT_FLOAT](data_1, sequential_1/dense_1/Add)' with input shapes: [?,300], [?,100]. 

The code being executed is as follows:

import os
import nltk
import numpy as np
import fasttext.util
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from sklearn.metrics.pairwise import cosine_similarity

#nltk.download('punkt')

# Functions for processing text and generating embeddings

def segment_sentences(file_path):
    """Segment text into sentences using NLTK."""
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    sentences = nltk.sent_tokenize(text)
    return sentences

def get_sentence_embedding(sentence, model):
    """Generate sentence embeddings using a FastText model."""
    words = sentence.split()
    embeddings = [model.get_word_vector(word) for word in words]
    return np.mean(embeddings, axis=0)

# Functions for neural network model creation and training

def create_lstm_model(vocab_size, embedding_dim=100, input_length=50):
    """Create and compile an LSTM neural network model."""
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length),
        LSTM(units=128, return_sequences=False),
        Dense(units=embedding_dim)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

def train_lstm_model(model, sentences, embeddings, tokenizer, epochs=5):
    """Train the LSTM model on the given sentences and embeddings."""
    sequences = tokenizer.texts_to_sequences(sentences)
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=50)
    model.fit(padded_sequences, np.array(embeddings), epochs=epochs)
    return model

# Functions for sentence alignment and output

def align_sentences(embeddings1, embeddings2):
    """Align sentences based on cosine similarity of embeddings."""
    similarity_matrix = cosine_similarity(embeddings1, embeddings2)
    alignment = []
    for i in range(len(embeddings1)):
        j = similarity_matrix[i].argmax()
        alignment.append((i, j))
    return alignment

def combine_alignments(alignment12, alignment13, alignment14):
    """Combine pairwise alignments into a single alignment."""
    combined_alignment = []
    for i, j in alignment12:
        k = next(k for k, l in alignment13 if k == i)
        l = next(l for m, l in alignment14 if m == i)
        combined_alignment.append((i, j, k, l))
    return combined_alignment

def output_aligned_sentences(alignment, sentences1, sentences2, sentences3, sentences4):
    """Print aligned sentences."""
    for i, j, k, l in alignment:
        print(f"text1: {sentences1[i]}")
        print(f"text2: {sentences2[j]}")
        print(f"text3: {sentences3[k]}")# this is a pre-processed text in Portuguese.
        print(f"text4: {sentences4[l]}")
        print('-' * 80)

# Main function to execute the alignment process

def main():
    directory = 'C:/Users/myuser/PycharmProjects/.venv/Lib/site-packages/fasttext'
    files = ['text1.txt', 'text2.txt', 'text3.txt', 'text4.txt']
    file_paths = [os.path.join(directory, file) for file in files]
    
    sentences_list = [segment_sentences(file_path) for file_path in file_paths]
    
    # Load pre-trained FastText model for embedding generation
    model = fasttext.load_model('cc.pt.300.bin') 
    
    embeddings_list = [[get_sentence_embedding(sentence, model) for sentence in sentences] for sentences in sentences_list]
    
    # Prepare tokenizer for each text
    tokenizers = [tf.keras.preprocessing.text.Tokenizer() for _ in sentences_list]
    for tokenizer, sentences in zip(tokenizers, sentences_list):
        tokenizer.fit_on_texts(sentences)
    
    # Create and train an LSTM model for each text
    lstm_models = []
    for tokenizer, sentences, embeddings, file_name in zip(tokenizers, sentences_list, embeddings_list, files):
        vocab_size = len(tokenizer.word_index) + 1
        lstm_model = create_lstm_model(vocab_size)
        trained_model = train_lstm_model(lstm_model, sentences, embeddings, tokenizer)
        lstm_models.append(trained_model)
        trained_model.save(os.path.join(directory, f'lstm_model_{file_name.split(".")[0]}.h5'))
    
    alignment12 = align_sentences(embeddings_list[0], embeddings_list[1])
    alignment13 = align_sentences(embeddings_list[0], embeddings_list[2])
    alignment14 = align_sentences(embeddings_list[0], embeddings_list[3])
    
    final_alignment = combine_alignments(alignment12, alignment13, alignment14)
    
    output_aligned_sentences(final_alignment, *sentences_list)

if __name__ == "__main__":
    main()

Can anyone help me overcome this error?

Upvotes: 0

Views: 56

Answers (0)

Related Questions