Reputation: 11
I am implementing a "perceptual loss" function, but the PyTorch and TensorFlow implementations give different results even though I use the same images. Please let me know why.
TensorFlow
import tensorflow as tf

class FeatureExtractor(tf.keras.Model):
    def __init__(self, n_layers):
        super(FeatureExtractor, self).__init__()
        extractor = tf.keras.applications.VGG16(weights="imagenet",
                                                include_top=False, input_shape=(256, 256, 3))
        extractor.trainable = True
        # collect the outputs of the requested VGG16 layers
        # features = [extractor.layers[i].output for i in n_layers]
        features = [extractor.get_layer(i).output for i in n_layers]
        self.extractor = tf.keras.models.Model(extractor.inputs, features)

    def call(self, x):
        return self.extractor(x)
def loss_function(generated_image, target_image, feature_extractor):
    MSE = tf.keras.losses.MeanSquaredError()
    mse_loss = MSE(generated_image, target_image)
    real_features = feature_extractor(target_image)
    generated_features = feature_extractor(generated_image)
    perceptual_loss = 0
    for i in range(len(real_features)):
        loss = MSE(real_features[i], generated_features[i])
        print(loss)
        perceptual_loss += loss
    return mse_loss, perceptual_loss
Run:
feature_extractor = FeatureExtractor(n_layers=["block1_conv1","block1_conv2",
"block3_conv2","block4_conv2"])
mse_loss, perceptual_loss = loss_function(image1, image2,
feature_extractor)
print(f"{mse_loss} {perceptual_loss} {mse_loss+perceptual_loss}")
It gives the following output:
tf.Tensor(0.0014001362, shape=(), dtype=float32)
tf.Tensor(0.030578917, shape=(), dtype=float32)
tf.Tensor(2.6163354, shape=(), dtype=float32)
tf.Tensor(0.842701, shape=(), dtype=float32)
0.002584027126431465 3.4910154342651367 3.4935994148254395
PyTorch
import torch
from torch import nn
import torchvision.models as models

class FeatureExtractor(torch.nn.Module):
    def __init__(self, n_layers):
        super(FeatureExtractor, self).__init__()
        extractor = models.vgg16(pretrained=True).features
        index = 0
        self.layers = nn.ModuleList([])
        # group consecutive vgg16.features modules so that each Sequential
        # ends at one of the requested layer indices
        for i in range(len(n_layers)):
            self.layers.append(torch.nn.Sequential())
            for j in range(index, n_layers[i] + 1):
                self.layers[i].add_module(str(j), extractor[j])
            index = n_layers[i] + 1
        for param in self.parameters():
            param.requires_grad = False

    def forward(self, x):
        result = []
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            result.append(x)
        return result
def loss_function(generated_image, target_image, feature_extractor):
    MSE = nn.MSELoss(reduction='mean')
    mse_loss = MSE(generated_image, target_image)
    real_features = feature_extractor(target_image)
    generated_features = feature_extractor(generated_image)
    perceptual_loss = 0
    for i in range(len(real_features)):
        loss = MSE(real_features[i], generated_features[i])
        perceptual_loss += loss
        print(loss)
    return mse_loss, perceptual_loss
Run:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
feature_extractor = FeatureExtractor(n_layers=[1, 3, 13, 20]).to(device)
mse_loss, perceptual_loss = loss_function(image1, image2, feature_extractor)
print(f"{mse_loss} {perceptual_loss} {mse_loss+perceptual_loss}")
It gives the following output:
tensor(0.0003)
tensor(0.0029)
tensor(0.2467)
tensor(0.2311)
0.002584027359262109 0.4810013473033905 0.483585387468338
Upvotes: 1
Views: 1464
Reputation: 1298
Although the two networks share the same architecture, the parameters of the final models may differ, because each framework loads its own set of pretrained weights. In addition, frameworks such as Keras and PyTorch preprocess input images differently before they are fed to the network, so the resulting tensors differ even when they come from the same image files. The following code is an example that may help to understand this.
from abc import ABC
import torch
import numpy as np
import tensorflow as tf
from torch import nn
from PIL import Image
from torch.autograd import Variable
import torchvision.models as models
import torchvision.transforms as transforms
from keras.preprocessing.image import load_img
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing.image import img_to_array
# 'https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg'
IMG_URL1 = 'the local path of 1200px-Cat03.jpg'
# 'https://upload.wikimedia.org/wikipedia/commons/b/bb/Kittyply_edit1.jpg'
IMG_URL2 = 'the local path of Kittyply_edit1.jpg'
# preprocess in keras
image1_tf = load_img(IMG_URL1, target_size=(224, 224))
image1_tf = img_to_array(image1_tf)
image1_tf = image1_tf.reshape((1, image1_tf.shape[0], image1_tf.shape[1], image1_tf.shape[2]))
image1_tf = preprocess_input(image1_tf)
image2_tf = load_img(IMG_URL2, target_size=(224, 224))
image2_tf = img_to_array(image2_tf)
image2_tf = image2_tf.reshape((1, image2_tf.shape[0], image2_tf.shape[1], image2_tf.shape[2]))
image2_tf = preprocess_input(image2_tf)
# preprocess in pytorch
image1_torch = Image.open(IMG_URL1)
image2_torch = Image.open(IMG_URL2)
image1_torch = image1_torch.resize((224, 224))
image2_torch = image2_torch.resize((224, 224))
min_img_size = 224
transform_pipeline = transforms.Compose([transforms.Resize(min_img_size),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])])
image1_torch = transform_pipeline(image1_torch)
image2_torch = transform_pipeline(image2_torch)
image1_torch = image1_torch.unsqueeze(0)
image2_torch = image2_torch.unsqueeze(0)
image1_torch = Variable(image1_torch)
image2_torch = Variable(image2_torch)
class FeatureExtractor(tf.keras.Model, ABC):
    def __init__(self, n_layers):
        super(FeatureExtractor, self).__init__()
        extractor = tf.keras.applications.VGG16(weights="imagenet", input_shape=(224, 224, 3))
        extractor.trainable = True
        features = [extractor.get_layer(i).output for i in n_layers]
        self.extractor = tf.keras.models.Model(extractor.inputs, features)

    def call(self, x):
        return self.extractor(x)
def loss_function(generated_image, target_image, feature_extractor):
    MSE = tf.keras.losses.MeanSquaredError()
    mse_loss = MSE(generated_image, target_image)
    real_features = feature_extractor(target_image)
    generated_features = feature_extractor(generated_image)
    print("tf prediction:", np.argmax(generated_features[-1].numpy()[0]))
    print("tf prediction:", np.argmax(real_features[-1].numpy()[0]))
    perceptual_loss = 0
    for i in range(len(real_features[:-1])):
        loss = MSE(real_features[i], generated_features[i])
        print(loss)
        perceptual_loss += loss
    return mse_loss, perceptual_loss
feature_extractor = FeatureExtractor(n_layers=["block1_conv1", "block1_conv2", "block3_conv2",
"block4_conv2", "predictions"])
print("tensorflow: ")
mse_loss, perceptual_loss = loss_function(image1_tf, image2_tf, feature_extractor)
print(f"{mse_loss} {perceptual_loss} {mse_loss + perceptual_loss}")
class FeatureExtractor1(torch.nn.Module):
    def __init__(self, n_layers):
        super(FeatureExtractor1, self).__init__()
        self.vgg = models.vgg16(pretrained=True)
        extractor = self.vgg.features
        index = 0
        self.layers = nn.ModuleList([])
        for i in range(len(n_layers)):
            self.layers.append(torch.nn.Sequential())
            for j in range(index, n_layers[i] + 1):
                self.layers[i].add_module(str(j), extractor[j])
            index = n_layers[i] + 1
        for param in self.parameters():
            param.requires_grad = False

    def forward(self, x):
        result = []
        predict = self.vgg(x)
        for i in range(len(self.layers)):
            x = self.layers[i](x)
            result.append(x)
        result.append(predict)
        return result
def loss_function1(generated_image, target_image, feature_extractor):
    MSE = nn.MSELoss(reduction='mean')
    mse_loss = MSE(generated_image, target_image)
    real_features = feature_extractor(target_image)
    generated_features = feature_extractor(generated_image)
    print("torch prediction:", np.argmax(generated_features[-1].numpy()[0]))
    print("torch prediction:", np.argmax(real_features[-1].numpy()[0]))
    perceptual_loss = 0
    for i in range(len(real_features[:-1])):
        loss = MSE(real_features[i], generated_features[i])
        perceptual_loss += loss
        print(loss)
    return mse_loss, perceptual_loss
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
feature_extractor = FeatureExtractor1(n_layers=[1, 3, 13, 20]).to(device)
print("pytorch: ")
mse_loss, perceptual_loss = loss_function1(image1_torch, image2_torch, feature_extractor)
print(f"{mse_loss} {perceptual_loss} {mse_loss + perceptual_loss}")
In addition, the model was trained for classification accuracy, so differences in the intermediate feature maps between the two networks are to be expected even when both predict the same class.
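The other source of discrepancy mentioned above, the pretrained weights themselves, can also be checked directly. The following is only a sketch (it assumes both frameworks download their default ImageNet weights) that compares the first convolution kernels after reordering the PyTorch tensor into Keras' layout:

# Sketch: compare the first conv kernel of the two pretrained VGG16 models.
import numpy as np
import tensorflow as tf
import torchvision.models as models

vgg_tf = tf.keras.applications.VGG16(weights="imagenet", include_top=False)
w_tf = vgg_tf.get_layer("block1_conv1").get_weights()[0]    # (3, 3, 3, 64) = (kh, kw, in, out)

vgg_torch = models.vgg16(pretrained=True)
w_torch = vgg_torch.features[0].weight.detach().numpy()     # (64, 3, 3, 3) = (out, in, kh, kw)
w_torch = np.transpose(w_torch, (2, 3, 1, 0))               # -> (kh, kw, in, out)

# If the two weight sets were identical ports of the same model, this would be ~0.
print("max abs diff:", np.abs(w_tf - w_torch).max())
print("tf mean/std:", w_tf.mean(), w_tf.std())
print("torch mean/std:", w_torch.mean(), w_torch.std())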
Upvotes: 1