starcitizenman
starcitizenman

Reputation: 1

Convolutional Autoencoder won't train data

I'm trying to create a convolutional Autoencoder that will work with seismic waveforms. The problem I have is that my model doesn't seem to be learning anything from the data, and when I test the model by comparing one waveform to the same, reconstructed version of it, I get a straight line. It should be outputting a similar waveform to the original:

original_vs_reconstructed_image (blue is original, orange is reconstructed)

In addition, the validation and test epochs don't change over time at all on my epoch graph.

I'm honestly not sure where to focus my debugging, but I suspect the problem lies with either the Autoencoder itself, the way my training function is structured, or the loss function I'm using (MSE).

I'm also fairly new to machine learning, so I might be missing something obvious or doing something totally wrong.

Here is my code that relates to this problem:

import glob
import numpy as np
import obspy as obs
import sklearn.model_selection
import torch
import torch.nn as nn
import torch.nn.functional
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import random
import sys

files = glob.glob('/loggerhead/coke/wf_Tony/trim/15_62.5/1108/DH1' + '/*.mseed')
#  empty list to store the properly read waveforms
waves = []
#  read all the files
for f in files:
    temp_wave = obs.read(f)
    A = temp_wave[0].data
    # normalization
    B = A/np.max(np.abs(A))
    # ensures every wave is size 3126
    waves.append(np.pad(B, (0, 3126 - B.size), 'constant'))
wave_arr = np.vstack(waves)
train_arr, test_arr = sklearn.model_selection.train_test_split(wave_arr, train_size=0.95)
train_torch = torch.tensor(train_arr, requires_grad=True).clone()
test_torch = torch.tensor(test_arr, requires_grad=True).clone()

train_waves = train_torch.unsqueeze_(1)
test_waves = test_torch.unsqueeze_(1)

k = 7
p = k//2


class AutoEncoder(nn.Module):
    def __init__(self):
        #  make sure to always initialize the super class when using outside methods
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Conv1d(1, 64, kernel_size=k, padding=p), nn.LeakyReLU(),
            nn.Conv1d(64, 64, kernel_size=k, padding=p), nn.LeakyReLU(), nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(64, 128, kernel_size=k, padding=p), nn.LeakyReLU(), nn.Conv1d(128, 128, kernel_size=k, padding=p),
            nn.LeakyReLU(), nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(128, 256, kernel_size=k, padding=p), nn.LeakyReLU(), nn.Conv1d(256, 256, kernel_size=k, padding=p),
            nn.LeakyReLU(), nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(256, 512, kernel_size=k, padding=p), nn.LeakyReLU(), nn.Conv1d(512, 512, kernel_size=k, padding=p),
            nn.LeakyReLU(), nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(512, 1024, kernel_size=k, padding=p), nn.LeakyReLU(), nn.Conv1d(1024, 1024, kernel_size=k, padding=p),
            nn.LeakyReLU()
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(1024, 512, kernel_size=2, stride=2), nn.LeakyReLU(),
            nn.ConvTranspose1d(512, 256, kernel_size=2, stride=2), nn.LeakyReLU(),
            nn.ConvTranspose1d(256, 128, kernel_size=2, stride=2), nn.LeakyReLU(),
            nn.ConvTranspose1d(128, 64, kernel_size=2, stride=2), nn.LeakyReLU(),
            nn.Conv1d(64, 1, kernel_size=1, padding=p), nn.Tanh()
        )  

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
    return x

model = AutoEncoder()
loss_function_MSE = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# Check if the GPU is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Selected device: {device}')

model.to(device)

# Training function
def train_epoch(model, device, loss_fn, optimizer):
    # Set train mode for both the encoder and the decoder
    model.train()
    train_loss = []
    train_tester = train_waves.clone().detach()
    # shuffle the training dataset
    train_tester = train_tester[torch.randperm(train_tester.size()[0])]
    for wave in train_tester:
        wave = wave.to(device)
        output_thing = model(wave)
        loss = loss_fn(output_thing, wave)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #   Print batch loss
        print('\t partial train loss (single batch): %f' % (loss.data))
        train_loss.append(loss.detach().cpu().numpy())

    return np.mean(train_loss)

# Testing function
def test_epoch(model, device, loss_fn):
    # Set evaluation mode for model
    model.eval()

    with torch.no_grad(): # No need to track the gradients
        # Define the lists to store the outputs for each batch
        conc_out = []
        conc_label = []
        for wave in test_waves:
            # Move tensor to the proper device
            wave = wave.to(device)
            # model data
            output_thing = model(wave)
            # Append the network output and the original image to the lists
            conc_out.append(output_thing.cpu())
            conc_label.append(wave.cpu())
        # Create a single tensor with all the values in the lists
        conc_out = torch.cat(conc_out)
        conc_label = torch.cat(conc_label)
        # Evaluate global loss
        val_loss = loss_fn(conc_out, conc_label)
    return val_loss.data

def plot_outputs(model):
    rand_num = random.randint(0, 4000)
    reconstructed = wave_torch_best[rand_num].to(device)
    reconstructed = model(reconstructed)
    new_numpy = reconstructed.detach().cpu().numpy()
    og = wave_torch_best[rand_num].detach().cpu().numpy()
    plt.plot(og[0, :])
    plt.plot(new_numpy[0, :])
    plt.savefig('/loggerhead/lwrigh89/Plots/Comparing Plots/reconstructed.png')

num_epochs = 4
diz_loss = {'train_loss':[],'val_loss':[]}
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, device, loss_function_MSE, optimizer)
        val_loss = test_epoch(model, device, loss_function_MSE)
        print('\n EPOCH {}/{} \t train loss {} \t val loss {}'.format(epoch + 1, num_epochs, train_loss, val_loss))
        diz_loss['train_loss'].append(train_loss)
        diz_loss['val_loss'].append(val_loss)
        if epoch == num_epochs + 1:
            torch.save(model.state_dict(), '/loggerhead/lwrigh89/Model/newmodel.pt')
            # plot og vs reconstructed
            plot_outputs(model)
            plt.figure(figsize=(10, 8))
            plt.semilogy(diz_loss['train_loss'], label='Train')
            plt.semilogy(diz_loss['val_loss'], label='Valid')
            plt.xlabel('Epoch')
            plt.ylabel('Average Loss')
            plt.legend()
            plt.savefig('/loggerhead/lwrigh89/Plots/Epochs/epochgraph.png')
            # exit program
            sys.exit()

I'm using a GPU server with CUDA version 11.4, Python version 3.10.5, and PyTorch version 1.12.0.

I would appreciate any help/guidance given.

Upvotes: 0

Views: 230

Answers (1)

hac81acnh
hac81acnh

Reputation: 82

I tried your code(with a little change) and it seems to work well. I noticed several things from your post;

(1) I am afraid that if epoch == num_epochs + 1: is always "not-satisfied". Because if you write for epoch in range(num_epochs):, it means that the maximum value of epoch in the loop would be num_epochs - 1. So there is a possibility that your output '.png's are not updated. Please check the save time of the png files.

(2) I tried your code. Here is the result. I used the random data to train the model, so it does not mean that "the accuracy is high" or not, but at least, it is not "all outputs are zeros." I attached the code below. In the code, I added the comment ### changed where I changed your original code.

import glob
import numpy as np
import obspy as obs
import sklearn.model_selection
import torch
import torch.nn as nn
import torch.nn.functional
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import random
import sys

# files = glob.glob('/loggerhead/coke/wf_Tony/trim/15_62.5/1108/DH1' + '/*.mseed')  ### changed
#  empty list to store the properly read waveforms
waves = []
#  read all the files
for f in range(4):#files:  ### changed
    #temp_wave = obs.read(f)  ### changed
    #A = temp_wave[0].data  ### changed
    A = np.random.random([1, 3126])  ### changed
    # normalization
    B = A/np.max(np.abs(A))
    # ensures every wave is size 3126
    waves.append(np.pad(B, (0, 3126 - B.size), 'constant'))
wave_arr = np.vstack(waves)
train_arr, test_arr = sklearn.model_selection.train_test_split(wave_arr, train_size=0.95)
train_torch = torch.tensor(train_arr, requires_grad=True).clone().float()
test_torch = torch.tensor(test_arr, requires_grad=True).clone().float()

train_waves = train_torch.unsqueeze_(1)
test_waves = test_torch.unsqueeze_(1)

k = 7
p = k//2


class AutoEncoder(nn.Module):
    def __init__(self):
        #  make sure to always initialize the super class when using outside methods
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Conv1d(1, 64, kernel_size=k, padding=p), nn.LeakyReLU(),
            nn.Conv1d(64, 64, kernel_size=k, padding=p), nn.LeakyReLU(), nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(64, 128, kernel_size=k, padding=p), nn.LeakyReLU(), nn.Conv1d(128, 128, kernel_size=k, padding=p),
            nn.LeakyReLU(), nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(128, 256, kernel_size=k, padding=p), nn.LeakyReLU(), nn.Conv1d(256, 256, kernel_size=k, padding=p),
            nn.LeakyReLU(), nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(256, 512, kernel_size=k, padding=p), nn.LeakyReLU(), nn.Conv1d(512, 512, kernel_size=k, padding=p),
            nn.LeakyReLU(), nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(512, 1024, kernel_size=k, padding=p), nn.LeakyReLU(), nn.Conv1d(1024, 1024, kernel_size=k, padding=p),
            nn.LeakyReLU()
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(1024, 512, kernel_size=2, stride=2), nn.LeakyReLU(),
            nn.ConvTranspose1d(512, 256, kernel_size=2, stride=2), nn.LeakyReLU(),
            nn.ConvTranspose1d(256, 128, kernel_size=2, stride=2), nn.LeakyReLU(),
            nn.ConvTranspose1d(128, 64, kernel_size=2, stride=2), nn.LeakyReLU(),
            nn.Conv1d(64, 1, kernel_size=1, padding=p), nn.Tanh()
        )  

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

model = AutoEncoder()
loss_function_MSE = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# Check if the GPU is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Selected device: {device}')

model.to(device)

# Training function
def train_epoch(model, device, loss_fn, optimizer):
    # Set train mode for both the encoder and the decoder
    model.train()
    train_loss = []
    train_tester = train_waves.clone().detach()
    # shuffle the training dataset
    train_tester = train_tester[torch.randperm(train_tester.size()[0])]
    for wave in train_tester:
        wave = wave.to(device)
        output_thing = model(wave)
        loss = loss_fn(output_thing, wave)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #   Print batch loss
        print('\t partial train loss (single batch): %f' % (loss.data))
        train_loss.append(loss.detach().cpu().numpy())

    return np.mean(train_loss)

# Testing function
def test_epoch(model, device, loss_fn):
    # Set evaluation mode for model
    model.eval()

    with torch.no_grad(): # No need to track the gradients
        # Define the lists to store the outputs for each batch
        conc_out = []
        conc_label = []
        for wave in test_waves:
            # Move tensor to the proper device
            wave = wave.to(device)
            # model data
            output_thing = model(wave)
            # Append the network output and the original image to the lists
            conc_out.append(output_thing.cpu())
            conc_label.append(wave.cpu())
        # Create a single tensor with all the values in the lists
        conc_out = torch.cat(conc_out)
        conc_label = torch.cat(conc_label)
        # Evaluate global loss
        val_loss = loss_fn(conc_out, conc_label)
    return val_loss.data

def plot_outputs(model):
    rand_num = 0#random.randint(0, 4000)  ### changed
    wave_torch_best = torch.from_numpy(B).float().unsqueeze_(0)  ### changed
    reconstructed = wave_torch_best[rand_num].to(device)
    reconstructed = model(reconstructed)
    new_numpy = reconstructed.detach().cpu().numpy()
    og = wave_torch_best[rand_num].detach().cpu().numpy()
    plt.plot(og[0, :])
    plt.plot(new_numpy[0, :])
    plt.savefig('reconstructed.png')

num_epochs = 100  ### changed
diz_loss = {'train_loss':[],'val_loss':[]}
if(True):
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, device, loss_function_MSE, optimizer)
        val_loss = test_epoch(model, device, loss_function_MSE)
        print('\n EPOCH {}/{} \t train loss {} \t val loss {}'.format(epoch + 1, num_epochs, train_loss, val_loss))
        diz_loss['train_loss'].append(train_loss)
        diz_loss['val_loss'].append(val_loss)
        if epoch == num_epochs - 1:  ### changed
            print('********')
            torch.save(model.state_dict(), 'newmodel.pt')
            # plot og vs reconstructed
            plot_outputs(model)
            plt.figure(figsize=(10, 8))
            plt.semilogy(diz_loss['train_loss'], label='Train')
            plt.semilogy(diz_loss['val_loss'], label='Valid')
            plt.xlabel('Epoch')
            plt.ylabel('Average Loss')
            plt.legend()
            plt.savefig('epochgraph.png')
            # exit program
            #sys.exit()  ### changed

Upvotes: 0

Related Questions