Johnson Z

Reputation: 31

I want to use NumPy to simulate the inference process of a quantized MobileNet V2 network, but my result differs from the one produced by the PyTorch model.

Python version: 3.8

PyTorch version: 1.9.0+cpu

Platform: Anaconda Spyder 5.0

To reproduce this problem, just copy all of the code below into a single file.

The ILSVRC2012_val_00000293.jpg file used in this code is shown below; you will also need to download it and change its path in the code.

(image: ILSVRC2012_val_00000293.jpg)

I am now working on a project that aims to develop a hardware accelerator to perform the inference of the MobileNet V2 network. I used the pretrained quantized PyTorch model to simulate the outcome, and the result comes out very well.

(image: classification outcome)

In order to implement this task in hardware, I wish to know every input and output, as well as every intermediate variable, produced while running this piece of PyTorch code. I used a package named torchextractor to fetch the output of the first layer, which in this case is a 3x3 convolution layer.

import numpy as np
import torchvision
import torch
from torchvision import transforms, datasets
from PIL import Image
from torchvision import transforms
import torchextractor as tx
import math
#########################################################################################
##### Processing of input image
#########################################################################################

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
test_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,])

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


#image file destination (use a raw string so the backslashes are not treated as escape sequences)
filename = r"D:\Project_UM\MobileNet_VC709\MobileNet_pytorch\ILSVRC2012_val_00000293.jpg"
input_image = Image.open(filename)
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)
#########################################################################################
#########################################################################################
#########################################################################################

#----First verify that the torchextractor class does not influence the inference outcome

# ofmp of layer1 before putting the model into torchextractor
# (quantize_tensor is defined further below; move its definition above this point when running the file top to bottom)
a, b, c = quantize_tensor(input_batch)  # quantize the input tensor and return a uint8 tensor, scale and zero point
input_qa = torch.quantize_per_tensor(input_batch.clone().detach(), b, c, torch.quint8)  # same quantization using torch.quantize_per_tensor

# Load a quantized mobilenet_v2 model
model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True)

model_quantized.eval()
with torch.no_grad():
    output = model_quantized.features[0][0](input_qa)# Ofmp of layer1, datatype : quantized_tensor

# print("FM of layer1 before tx_extractor:\n",output.int_repr())# Ofmp of layer1, datatype : int8 tensor
output1_clone = output.int_repr().detach().numpy()# Clone ofmp of layer1, datatype : ndarray



#########################################################################################
#########################################################################################
#########################################################################################

# ofmp of layer1 after adding torchextractor
model_quantized_ex = tx.Extractor(model_quantized, ["features.0.0"])#Capture of the module inside first layer
model_output, features = model_quantized_ex(input_batch)# Forward propagation
# feature_shapes = {name: f.shape for name, f in features.items()}
# print(features['features.0.0']) # Ofmp of layer1, datatype : quantized_tensor
out1_clone = features['features.0.0'].int_repr().numpy() # Clone ofmp of layer1, datatype : ndarray


if np.array_equal(out1_clone, output1_clone):
    print('Model with torchextractor attached outputs the same values as the original model')
else:
    print('The torchextractor method influences the outcome')

Here I define a NumPy quantization scheme based on the scheme proposed in the paper Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference.

# Convert a regular float tensor to a quantized tensor together with its scale and zero_point
def quantize_tensor(x, num_bits=8):# quantize the input tensor and return a uint8 tensor, scale and zero point

    qmin = 0.
    qmax = 2.**num_bits - 1.
    min_val, max_val = x.min(), x.max()

    scale = (max_val - min_val) / (qmax - qmin)

    initial_zero_point = qmin - min_val / scale

    zero_point = 0
    if initial_zero_point < qmin:
        zero_point = qmin
    elif initial_zero_point > qmax:
        zero_point = qmax
    else:
        zero_point = initial_zero_point

    # print(zero_point)
    zero_point = int(zero_point)
    q_x = zero_point + x / scale
    q_x.clamp_(qmin, qmax).round_()
    q_x = q_x.byte()
    return q_x, scale, zero_point
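
For sanity checking, a matching dequantize helper is useful; this is a minimal sketch of the inverse mapping x ≈ scale * (q - zero_point), not part of the original code:

def dequantize_tensor(q_x, scale, zero_point):
    # Inverse of quantize_tensor: map the uint8 values back to float
    return scale * (q_x.float() - zero_point)

# Quick round-trip check on the preprocessed input batch
q_check, s_check, zp_check = quantize_tensor(input_batch)
recovered = dequantize_tensor(q_check, s_check, zp_check)
print((recovered - input_batch).abs().max())  # error should be on the order of one scale step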

#%%
# #############################################################################################
# ---------  Simulate the inference process of layer0 (3x3 convolution) using numpy
# #############################################################################################


# get the input_batch quantized buffer data
input_scale = b.item()
input_zero  = c
input_quantized = a[0].detach().numpy()

# get the layer0 output scale and zero_point
output_scale = model_quantized.features[0][0].state_dict()['scale'].item()
output_zero  = model_quantized.features[0][0].state_dict()['zero_point'].item()

# get the quantized weight with scale and zero_point  
weight_scale = model_quantized.features[0][0].state_dict()["weight"].q_scale()
weight_zero  = model_quantized.features[0][0].state_dict()["weight"].q_zero_point()
weight_quantized = model_quantized.features[0][0].state_dict()["weight"].int_repr().numpy()
# print(weight_quantized)
# print(weight_quantized.shape)


# bias_quantized,bias_scale,bias_zero= quantize_tensor(model_quantized.features[0][0].state_dict()["bias"])# to quantize the input tensor and return an int8 tensor, scale and zero point
# print(bias_quantized.shape)
bias = model_quantized.features[0][0].state_dict()["bias"].detach().numpy()
# print(input_quantized)
print(type(input_scale))
print(type(output_scale))
print(type(weight_scale))

Then I wrote a quantized 2D convolution using NumPy, hoping to figure out every detail of the PyTorch data flow during inference.

#%% numpy simulated layer0 convolution function define

def conv_cal(input_quantized, weight_quantized, kernel_size, stride, out_i, out_j, out_k):
    weight = weight_quantized[out_i]
    input = np.zeros((input_quantized.shape[0], kernel_size, kernel_size))
    for i in range(weight.shape[0]):
        for j in range(weight.shape[1]):
            for k in range(weight.shape[2]):
                input[i][j][k] = input_quantized[i][stride*out_j+j][stride*out_k+k]
    # print(np.dot(weight,input))
    # print(input,"\n")
    # print(weight)

    return np.multiply(weight,input).sum()

def QuantizedConv2D(input_scale, input_zero, input_quantized, output_scale, output_zero, weight_scale, weight_zero, weight_quantized, bias, kernel_size, stride, padding, ofm_size):
    output = np.zeros((weight_quantized.shape[0],ofm_size,ofm_size))

    input_quantized_padding = np.full((input_quantized.shape[0],input_quantized.shape[1]+2*padding,input_quantized.shape[2]+2*padding),0)
    zero_temp = np.full(input_quantized.shape,input_zero)
    input_quantized = input_quantized - zero_temp
    for i in range(input_quantized.shape[0]):
        for j in range(padding,padding + input_quantized.shape[1]):
            for k in range(padding,padding + input_quantized.shape[2]):
                input_quantized_padding[i][j][k] = input_quantized[i][j-padding][k-padding]

    zero_temp = np.full(weight_quantized.shape, weight_zero)
    weight_quantized = weight_quantized - zero_temp

    for i in range(output.shape[0]):
        for j in range(output.shape[1]):
            for k in range(output.shape[2]):
                # output[i][j][k] = (weight_scale*input_scale)*conv_cal(input_quantized_padding, weight_quantized, kernel_size, stride, i, j, k) + bias[i] #floating_output
                output[i][j][k] = weight_scale*input_scale/output_scale*conv_cal(input_quantized_padding, weight_quantized, kernel_size, stride, i, j, k) + bias[i]/output_scale + output_zero
                output[i][j][k] = round(output[i][j][k])
                # int_output
    return output  

Here I feed in the same image, weights, and bias together with their zero_points and scales, then compare this "numpy simulated" result to the one calculated by PyTorch.

quantized_model_out1_int8 = np.squeeze(features['features.0.0'].int_repr().numpy())


print(quantized_model_out1_int8.shape)
print(quantized_model_out1_int8)
out1_np = QuantizedConv2D(input_scale, input_zero, input_quantized, output_scale, output_zero, weight_scale, weight_zero, weight_quantized, bias, 3, 2, 1, 112)
np.save("out1_np.npy",out1_np)

for i in range(quantized_model_out1_int8.shape[0]):
    for j in range(quantized_model_out1_int8.shape[1]):
        for k in range(quantized_model_out1_int8.shape[2]):
            if(out1_np[i][j][k] < 0):
                out1_np[i][j][k] = 0

print(out1_np)

flag = np.zeros(quantized_model_out1_int8.shape)
for i in range(quantized_model_out1_int8.shape[0]):
    for j in range(quantized_model_out1_int8.shape[1]):
        for k in range(quantized_model_out1_int8.shape[2]):

            if(quantized_model_out1_int8[i][j][k] == out1_np[i][j][k]):
                flag[i][j][k] = 1
                out1_np[i][j][k] = 0
                quantized_model_out1_int8[i][j][k] = 0

# Compare the simulated result to extractor fetched result, gain the total hit rate
print(flag.sum()/(112*112*32)*100,'%')

If the "numpy simulated" results are the same as the extracted one, call it a hit. Print the total hit rate, it shows that numpy gets 92% of the values right. Now the problem is, I have no idea why the rest 8% of values come out wrong.

(image: hit rate)
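
As an aside, the hit rate can also be computed without the triple loop; a minimal NumPy sketch (run it on the two arrays before the loop above zeroes out the matched entries):

hit_rate = (quantized_model_out1_int8 == out1_np).mean() * 100  # fraction of matching elements, in percent
print(hit_rate, '%')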

    #%% A test code to check the calculation process
    weight_quantized_sample = weight_quantized[2]
    M_t = input_scale * weight_scale / output_scale
    ifmap_t = np.int32(input_quantized[:,1:4,7:10])
    weight_t = np.int32(weight_quantized_sample)
    bias_t = bias[2]
    bias_q = bias_t/output_scale
    res_t = 0
    for ch in range(3):
        ifmap_offset = ifmap_t[ch]-np.int32(input_zero)
        weight_offset = weight_t[ch]-np.int32(weight_zero)
        res_ch = np.multiply(ifmap_offset, weight_offset)
        res_ch = res_ch.sum()
        res_t = res_t + res_ch
    res_mul = M_t*res_t
    # for n in range(1, 30):
    #     res_mul = multiply(n, M_t, res_t)
    res_t = round(res_mul + output_zero + bias_q)
    print(res_t)

Could you help me out of this? I have been stuck here for a long time.

Upvotes: 2

Views: 758

Answers (1)

Roman Malashin

Reputation: 108

I implemented my own version of the quantized convolution and got between a 99.999% and 100% hit rate (the single mismatched value is off by 1, which I consider a rounding issue). The link to the paper in the question helped a lot.

But I found that your formulas are the same as mine, so I don't know what your issue was. As I understand it, quantization in PyTorch is hardware dependent.
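
If backend differences are suspected, the quantized engine PyTorch uses can be inspected and switched; a small sketch, not part of the original answer:

import torch

# List the quantized backends this build supports and the one currently selected
print(torch.backends.quantized.supported_engines)  # e.g. ['qnnpack', 'fbgemm', 'none']
print(torch.backends.quantized.engine)

# Switch to a specific backend (must be one of the supported engines)
torch.backends.quantized.engine = 'fbgemm'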

Here is my code:

def my_Conv2dRelu_b2(input_q, conv_layer, output_shape):
    '''

    Args:
        input_q: quantized input tensor
        conv_layer: quantized convolution layer
        output_shape: the pre-computed shape of the result

    Returns:

    '''
    output = np.zeros(output_shape)

    # extract needed float numbers from quantized operations
    weights_scale = conv_layer.weight().q_per_channel_scales()
    input_scale = input_q.q_scale()
    weights_zp = conv_layer.weight().q_per_channel_zero_points()
    input_zp = input_q.q_zero_point()

    # extract needed convolution parameters
    padding = conv_layer.padding
    stride = conv_layer.stride

    # extract float numbers for results
    output_zp = conv_layer.zero_point
    output_scale = conv_layer.scale
    conv_weights_int = conv_layer.weight().int_repr()
    input_int = input_q.int_repr()

    biases = conv_layer.bias().numpy()
    for k in range(input_q.shape[0]):
        for i in range(conv_weights_int.shape[0]):
            output[k][i] = manual_convolution_quant(
                input_int[k].numpy(),
                conv_weights_int[i].numpy(),
                biases[i],
                padding=padding,
                stride=stride,
                image_zp=input_zp, image_scale=input_scale,
                kernel_zp=weights_zp[i].item(), kernel_scale=weights_scale[i].item(),
                result_zp=output_zp, result_scale=output_scale
            )
    return output


def manual_convolution_quant(image, kernel, b, padding, stride, image_zp, image_scale, kernel_zp, kernel_scale,
                             result_zp, result_scale):
    H = image.shape[1]
    W = image.shape[2]
    new_H = H // stride[0]
    new_W = W // stride[1]
    results = np.zeros([new_H, new_W])

    M = image_scale * kernel_scale / result_scale
    bias = b / result_scale
    paddedIm = np.pad(
        image,
        [(0, 0), (padding[0], padding[0]), (padding[1], padding[1])],
        mode="constant",
        constant_values=image_zp,
    )
    s = kernel.shape[1]
    for i in range(new_H):
        for j in range(new_W):
            patch = paddedIm[
                    :, i * stride[0]: i * stride[0] + s, j * stride[1]: j * stride[1] + s
                    ]
            res = M * ((kernel - kernel_zp) * (patch - image_zp)).sum() + result_zp + bias
            if res < 0:
                res = 0
            results[i, j] = round(res)

    return results

Code to compare the PyTorch output with my own version:

def calc_hit_rate(array1, array2):
    good = (array1 == array2).sum()  # number of matching elements
    total = array1.size
    return good / total


# during inference
y2 = model.conv1(y1)
y2_int = torch.int_repr(y2)

y2_int_manual = my_Conv2dRelu_b2(y1, model.conv1, y2.shape)
print(f'y2 hit rate= {calc_hit_rate(y2.int_repr().numpy(), y2_int_manual)}') #hit_rate=1.0

Upvotes: 1
