choi

Reputation: 33

Problem exporting batchnorm weights from PyTorch to Keras

I followed Pytorch Batchnorm layer different from Keras Batchnorm and Pytorch Batchnorm implementation, but they do not solve my problem.

I also read the Wikipedia article on batch normalization, and I searched the source code of the TensorFlow batchnorm and the PyTorch batchnorm.

Below is my testing code. The results from PyTorch and Keras differ by an error on the order of 1e-2 to 1e-3, which is large. The functions bn0 and bn1 are close to the torch result but still not quite accurate, and bn2 tries to follow the formula used in the TensorFlow batchnorm.

The convolution part yields the same result, but I am stuck at the batchnorm layer. I also use eval() and no_grad() for the PyTorch model and model.predict for the Keras model to make sure both are in inference mode.

The TensorFlow implementation does not seem to use 1/sqrt(var+eps), but sqrt(var+eps) instead. I tried transferring 1/running_var to keras.BN.moving_var, but the result still does not match.

import numpy as np
import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras import Model as KModel
import torch.nn as nn
import torch

def KM():
    x = L.Input((None,None,3))
    y0 = L.Concatenate(axis=-1)([x[:,::2,::2,:],x[:,::2,1::2,:],x[:,1::2,::2,:],x[:,1::2,1::2,:]])
    y1 = L.Conv2D(32,3,1,"same",use_bias=False)(y0)
    y2 = L.BatchNormalization()(y1)
    y3 = L.LeakyReLU(0.1)(y2)
    return KModel(x, [y1, y2, y3])

class YM(nn.Module):
    def __init__(self):
        super(YM, self).__init__()
        self.cat = lambda x : torch.cat([x[:,:,::2,::2],x[:,:,::2,1::2],x[:,:,1::2,::2],x[:,:,1::2,1::2]],axis=1)
        self.conv = nn.Conv2d(12,32,3,1,1,bias=False)
        self.bn = nn.BatchNorm2d(32)
        self.act = nn.LeakyReLU(0.1)

    def forward(self, x):
        y0 = self.cat(x)
        y0 = self.conv(y0)
        y1 = self.bn(y0)
        y2 = self.act(y1)
        return [y0, y1, y2]

np.random.seed(0)
img = np.random.randint(0,255,(1,12,14,3)).astype(np.float32)
img_torch = torch.from_numpy(img.transpose(0,3,1,2).astype(np.float32))
w1 = np.random.rand(32,12,3,3).astype(np.float32)*0.1
bw1 = np.random.rand(32).astype(np.float32)*0.1
bb1 = np.random.rand(32).astype(np.float32)
bm1 = np.random.rand(32).astype(np.float32)
bv1 = np.abs(np.random.rand(32).astype(np.float32))*0.1

ym = YM()
km = KM()

ym.conv.weight = nn.Parameter(torch.from_numpy(w1))
ym.bn.weight = nn.Parameter(torch.from_numpy(bw1))
ym.bn.bias = nn.Parameter(torch.from_numpy(bb1))
ym.bn.running_mean = torch.from_numpy(bm1)
ym.bn.running_var = torch.from_numpy(bv1)

km.layers[6].set_weights([w1.transpose(2,3,1,0)])
km.layers[7].set_weights([bw1, bb1, bm1, bv1])

ym.eval()
ym.bn.track_running_stats = True
with torch.no_grad():
    t0 = ym(img_torch/255.-0.5)
k0 = km.predict(img/255.-0.5)

for i in range(len(t0)):
    print(t0[i].shape, k0[i].shape)

Key = 1
print(t0[Key][0,0,:,:].detach().numpy())
print(k0[Key][0,:,:,0])

>>>>>>>>>>>
[[    0.71826     0.72964     0.73189     0.70224     0.74954     0.72928      0.7524]
 [    0.71305     0.68717     0.68581      0.7242     0.73491     0.71925     0.70781]
 [    0.70145     0.66769      0.6857     0.70804     0.73533     0.73165     0.72006]
 [     0.6758     0.69231     0.71173     0.71325     0.72097     0.71414     0.75782]
 [    0.68255     0.72283     0.71273      0.7226     0.71788     0.68119     0.72556]
 [    0.70452     0.68088     0.74389     0.73558     0.72853      0.7174     0.74389]]
[[    0.71953     0.73082     0.73306     0.70365     0.75056     0.73046     0.75339]
 [    0.71437      0.6887     0.68736     0.72543     0.73605     0.72052     0.70918]
 [    0.70287     0.66939     0.68724      0.7094     0.73647     0.73282     0.72133]
 [    0.67743      0.6938     0.71306     0.71457     0.72223     0.71545     0.75877]
 [    0.68413     0.72407     0.71405     0.72384     0.71916     0.68278     0.72678]
 [    0.70592     0.68246     0.74495     0.73671     0.72972     0.71868     0.74496]]

tt = t0[Key].detach().numpy().transpose(0,2,3,1)
kk = k0[Key]
np.abs(tt-kk).max()
>>>>>>>>>>
0.078752756
gamma, beta = bw1[0], bb1[0]
mu, var = bm1[0], bv1[0]
x_p = t0[0][0,0,0,0]

print(gamma,beta,mu,var,x_p)

eps = 1e-10
def bn0(x_p, mu, var, gamma, beta):
    # wiki
    xhat = (x_p - mu)/np.sqrt(var + eps)
    _x = xhat * gamma + beta
    return _x

def bn1(x_p, mu, var, gamma, beta):
    # pytorch cpp
    inv_var = 1/ np.sqrt(var + eps)
    alpha_d = gamma * inv_var
    beta_d = beta - mu * inv_var * gamma
    return x_p * alpha_d + beta_d

def bn2(x_p, mu, var, gamma, beta):
    # tensorflow cpp
    inv_var = np.sqrt(var + eps)
    xhat = (x_p - mu)*inv_var
    _x = xhat * gamma + beta    
    return _x
print(bn0(x_p, mu, var, gamma, beta))
print(bn1(x_p, mu, var, gamma, beta))
print(bn2(x_p, mu, var, gamma, beta))
print(bn2(x_p, mu, 1/var, gamma, beta))

>>>>>>>>
0.048011426 0.87305844 0.67954195 0.059197646 tensor(-0.26256)
tensor(0.68715)
tensor(0.68715)
tensor(0.86205)
tensor(0.68715)

Upvotes: 1

Views: 817

Answers (1)

choi

Reputation: 11

  1. Keras seems to use a different default epsilon (1e-3) than PyTorch (1e-5); see the first sketch after this list.

  2. The "var" input in the TensorFlow source code seems to have already been replaced by 1/moving_variance somewhere upstream.

  3. Besides batchnorm, the padding strategies of TensorFlow and PyTorch may yield different outputs. When transferring weights from PyTorch to TensorFlow and the stride is greater than 1, it is recommended to use ZeroPadding2D in TensorFlow to specify the padding explicitly and then apply a "valid" Conv2D afterwards (see the second sketch after this list).

  4. The accumulated error can be large across the whole network. For a small network of ~70 layers, there is around 0.6 maximum error before the final activation function.
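A minimal sketch of the epsilon fix in point 1, using a standalone BatchNorm layer in each framework (the channel count, shapes, and random statistics here are just placeholders, not from the question): forcing the Keras layer's epsilon to PyTorch's default of 1e-5 should make the two outputs agree up to float rounding.

import numpy as np
import tensorflow as tf
import tensorflow.keras.layers as L
import torch
import torch.nn as nn

gamma = np.random.rand(32).astype(np.float32)
beta = np.random.rand(32).astype(np.float32)
mean = np.random.rand(32).astype(np.float32)
var = np.abs(np.random.rand(32).astype(np.float32))

# PyTorch BN in inference mode with the given statistics
bn_t = nn.BatchNorm2d(32)                       # default eps = 1e-5
bn_t.weight = nn.Parameter(torch.from_numpy(gamma))
bn_t.bias = nn.Parameter(torch.from_numpy(beta))
bn_t.running_mean = torch.from_numpy(mean)
bn_t.running_var = torch.from_numpy(var)
bn_t.eval()

# Keras BN with epsilon matched to PyTorch (Keras default is 1e-3)
x = L.Input((None, None, 32))
y = L.BatchNormalization(epsilon=1e-5)(x)
bn_k = tf.keras.Model(x, y)
bn_k.layers[1].set_weights([gamma, beta, mean, var])

inp = np.random.rand(1, 6, 7, 32).astype(np.float32)
with torch.no_grad():
    out_t = bn_t(torch.from_numpy(inp.transpose(0, 3, 1, 2))).numpy().transpose(0, 2, 3, 1)
out_k = bn_k.predict(inp)
print(np.abs(out_t - out_k).max())              # expect float-rounding error, not ~1e-2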
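And a sketch of the padding advice in point 3, again with hypothetical shapes: with stride 2, TensorFlow's "same" padding can pad asymmetrically (bottom/right only for even input sizes), while PyTorch's padding=1 is symmetric, so pad explicitly and convolve with "valid" padding.

import tensorflow as tf
import tensorflow.keras.layers as L

# The PyTorch side would be something like nn.Conv2d(12, 32, 3, stride=2, padding=1).
x = L.Input((None, None, 12))
y = L.ZeroPadding2D(((1, 1), (1, 1)))(x)        # replicate PyTorch's symmetric padding=1
y = L.Conv2D(32, 3, strides=2, padding="valid", use_bias=False)(y)
model = tf.keras.Model(x, y)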

Upvotes: 1
