Convolutional Autoencoder won't train data

Question

I'm trying to create a convolutional autoencoder that works with seismic waveforms. The problem is that my model doesn't seem to learn anything from the data: when I test it by comparing a waveform with its own reconstruction, the reconstruction is a straight line, whereas it should resemble the original waveform:

original_vs_reconstructed_image (blue is original, orange is reconstructed)

In addition, the training and validation losses don't change at all from epoch to epoch on my loss graph.

I'm honestly not sure where to focus my debugging, but I suspect the problem lies with either the Autoencoder itself, the way my training function is structured, or the loss function I'm using (MSE).

I'm also fairly new to machine learning, so I might be missing something obvious or doing something totally wrong.

Here is my code that relates to this problem:

import glob
import numpy as np
import obspy as obs
import sklearn.model_selection
import torch
import torch.nn as nn
import torch.nn.functional
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import random
import sys

# Load every miniSEED file, peak-normalise the first trace of each one,
# zero-pad it to a common length and split the result into train/test tensors
# shaped (n_waves, 1, 3126).
files = glob.glob('/loggerhead/coke/wf_Tony/trim/15_62.5/1108/DH1' + '/*.mseed')
#  empty list to store the properly read waveforms
waves = []
#  read all the files
for f in files:
    temp_wave = obs.read(f)
    A = temp_wave[0].data
    # normalization to [-1, 1] by the absolute peak; an all-zero trace is kept
    # as zeros instead of being turned into NaNs by a 0/0 division, because a
    # single NaN sample is enough to make the loss NaN
    peak = np.max(np.abs(A))
    B = A / peak if peak > 0 else A.astype(np.float64)
    # ensures every wave is size 3126
    waves.append(np.pad(B, (0, 3126 - B.size), 'constant'))
wave_arr = np.vstack(waves)
train_arr, test_arr = sklearn.model_selection.train_test_split(wave_arr, train_size=0.95)
# nn.Conv1d parameters are float32 by default, so the data is cast explicitly:
# numpy division of integer samples yields float64, which the model rejects.
# The inputs are plain data, so they are created without requires_grad; only
# the model parameters need gradients.
train_torch = torch.tensor(train_arr, dtype=torch.float32)
test_torch = torch.tensor(test_arr, dtype=torch.float32)

# Insert the channel axis in place: (n_waves, 3126) -> (n_waves, 1, 3126).
train_waves = train_torch.unsqueeze_(1)
test_waves = test_torch.unsqueeze_(1)

# Kernel size shared by every encoder convolution, and the matching
# "same"-style padding that keeps the sequence length unchanged.
k = 7
p = k//2


class AutoEncoder(nn.Module):
    """1-D convolutional autoencoder for single-channel waveforms.

    The encoder stacks five pairs of length-preserving convolutions
    (1 -> 64 -> 128 -> 256 -> 512 -> 1024 channels) and halves the length
    with a max-pool after each of the first four pairs. The decoder doubles
    the length four times with stride-2 transposed convolutions and maps
    back to one channel through a 1x1 convolution followed by ``tanh``.

    For an input of length ``L`` the output length is
    ``16 * (L // 16) + 2 * p``: the pools floor the length at every stage
    and the final 1x1 convolution is zero-padded by ``p`` on both sides.
    """

    def __init__(self):
        #  make sure to always initialize the super class when using outside methods
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Conv1d(1, 64, kernel_size=k, padding=p),
            nn.LeakyReLU(),
            nn.Conv1d(64, 64, kernel_size=k, padding=p),
            nn.LeakyReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(64, 128, kernel_size=k, padding=p),
            nn.LeakyReLU(),
            nn.Conv1d(128, 128, kernel_size=k, padding=p),
            nn.LeakyReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(128, 256, kernel_size=k, padding=p),
            nn.LeakyReLU(),
            nn.Conv1d(256, 256, kernel_size=k, padding=p),
            nn.LeakyReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(256, 512, kernel_size=k, padding=p),
            nn.LeakyReLU(),
            nn.Conv1d(512, 512, kernel_size=k, padding=p),
            nn.LeakyReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(512, 1024, kernel_size=k, padding=p),
            nn.LeakyReLU(),
            nn.Conv1d(1024, 1024, kernel_size=k, padding=p),
            nn.LeakyReLU(),
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(1024, 512, kernel_size=2, stride=2),
            nn.LeakyReLU(),
            nn.ConvTranspose1d(512, 256, kernel_size=2, stride=2),
            nn.LeakyReLU(),
            nn.ConvTranspose1d(256, 128, kernel_size=2, stride=2),
            nn.LeakyReLU(),
            nn.ConvTranspose1d(128, 64, kernel_size=2, stride=2),
            nn.LeakyReLU(),
            # NOTE(review): a 1x1 kernel with padding=p lengthens the output by
            # 2*p samples. The p positions at each edge only see zero padding,
            # so they evaluate to tanh(bias), and the decoded signal sits p
            # samples to the right of where the upsampling stack produced it.
            # Confirm this is the intended way to restore the length.
            nn.Conv1d(64, 1, kernel_size=1, padding=p),
            nn.Tanh(),
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction.

        ``x`` carries one channel in its second-to-last dimension; the result
        has one channel and lies in [-1, 1] because of the final ``tanh``.
        """
        x = self.encoder(x)
        x = self.decoder(x)
        # The return must live inside forward(); dedented to class level it is
        # a SyntaxError and the model can never produce an output.
        return x

# Check if the GPU is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Selected device: {device}')

# Move the model to its device before building the optimizer, as the
# torch.optim documentation recommends, so the optimizer is guaranteed to
# hold the parameters that are actually trained.
model = AutoEncoder()
model.to(device)

loss_function_MSE = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# Training function
def train_epoch(model, device, loss_fn, optimizer):
    """Run one optimisation pass over the module-level ``train_waves``.

    The waves are visited one at a time in a freshly shuffled order; each
    one is reconstructed by ``model``, scored against itself with
    ``loss_fn`` and followed by a single optimizer step. The loss of every
    step is printed.

    Returns:
        The mean of the per-wave losses, computed with ``np.mean``.
    """
    model.train()

    # Work on a detached copy so the stored dataset is left untouched.
    samples = train_waves.clone().detach()
    order = torch.randperm(samples.size()[0])

    step_losses = []
    for sample in samples[order]:
        sample = sample.to(device)
        reconstruction = model(sample)
        loss = loss_fn(reconstruction, sample)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print('\t partial train loss (single batch): %f' % (loss.data))
        step_losses.append(loss.detach().cpu().numpy())

    return np.mean(step_losses)

# Testing function
def test_epoch(model, device, loss_fn):
    """Score ``model`` on the module-level ``test_waves`` without gradients.

    Every wave is reconstructed individually; the reconstructions and the
    originals are concatenated on the CPU and ``loss_fn`` is evaluated once
    over the two concatenated tensors.

    Returns:
        The resulting loss as a tensor (``.data`` of the loss).
    """
    model.eval()

    reconstructions, originals = [], []
    with torch.no_grad():  # evaluation only, nothing to back-propagate
        for wave in test_waves:
            wave = wave.to(device)
            reconstructions.append(model(wave).cpu())
            originals.append(wave.cpu())
        # One global loss over all waves rather than an average of per-wave losses.
        val_loss = loss_fn(torch.cat(reconstructions), torch.cat(originals))
    return val_loss.data

def plot_outputs(model):
    """Plot one randomly chosen waveform against its reconstruction.

    The original is drawn first and the reconstruction second on the current
    matplotlib figure, which is then saved to a fixed PNG path.

    Args:
        model: The autoencoder used to reconstruct the waveform.
    """
    # NOTE(review): `wave_torch_best` is not defined in the code shown here; it
    # is assumed to be a module-level, indexable collection of tensors whose
    # first axis is the channel - confirm against the rest of the script.
    # Draw the index from the actual dataset size: a hard-coded upper bound of
    # 4000 raises IndexError on smaller datasets and never samples the tail of
    # larger ones.
    rand_num = random.randrange(len(wave_torch_best))
    original = wave_torch_best[rand_num]
    # Plotting needs no gradients, so skip building the autograd graph.
    with torch.no_grad():
        reconstructed = model(original.to(device))
    new_numpy = reconstructed.detach().cpu().numpy()
    og = original.detach().cpu().numpy()
    plt.plot(og[0, :])
    plt.plot(new_numpy[0, :])
    plt.savefig('/loggerhead/lwrigh89/Plots/Comparing Plots/reconstructed.png')

# Train for a fixed number of epochs, recording the train/validation loss of
# each one, then save the model and the diagnostic plots and stop the script.
num_epochs = 4
diz_loss = {'train_loss': [], 'val_loss': []}
for epoch in range(num_epochs):
    train_loss = train_epoch(model, device, loss_function_MSE, optimizer)
    val_loss = test_epoch(model, device, loss_function_MSE)
    print('\n EPOCH {}/{} \t train loss {} \t val loss {}'.format(epoch + 1, num_epochs, train_loss, val_loss))
    # Store plain floats so the history can be plotted directly.
    diz_loss['train_loss'].append(float(train_loss))
    diz_loss['val_loss'].append(float(val_loss))

# This runs once, after the last epoch. The previous in-loop guard
# `epoch == num_epochs + 1` can never be true for epoch in range(num_epochs),
# so the model and the plots were never written.
torch.save(model.state_dict(), '/loggerhead/lwrigh89/Model/newmodel.pt')
# plot og vs reconstructed
plot_outputs(model)
plt.figure(figsize=(10, 8))
plt.semilogy(diz_loss['train_loss'], label='Train')
plt.semilogy(diz_loss['val_loss'], label='Valid')
plt.xlabel('Epoch')
plt.ylabel('Average Loss')
plt.legend()
plt.savefig('/loggerhead/lwrigh89/Plots/Epochs/epochgraph.png')
# exit program
sys.exit()

I'm using a GPU server with CUDA version 11.4, Python version 3.10.5, and PyTorch version 1.12.0.

I would appreciate any help/guidance given.

Convolutional Autoencoder won't train data

Answers (1)

Related Questions

Convolutional Autoencoder won't train data

Answers (1)

Related Questions

Convolutional Autoencoder won't train data