Fast convolution algorithm have different outputs compared to regular convolution algorithm?

Question

I was trying to speed up a forward pass in a convolutional layer by using a technique of converting my image data to a column vector and essentially converting my convolution problem into a matrix multiplication problem.

[idea from https://sahnimanas.github.io/post/anatomy-of-a-high-performance-convolution/]

I did so by first implementing an im2col function from caffe's official github

void im2col_cpu(float* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad,const int stride,const int dilation,float* data_col) {
  int dil_kernel_h = (kernel_h - 1) * dilation + 1;
  int dil_kernel_w = (kernel_w - 1) * dilation + 1;
  int height_col = (height + 2 * pad - dil_kernel_h) / stride + 1;
  int width_col = (width + 2 * pad - dil_kernel_w) / stride + 1;
  int channels_col = channels * kernel_h * kernel_w;
  
  #pragma omp parallel for
  for (int c = 0; c < channels_col; ++c) {
    int w_offset = c % kernel_w;
    int h_offset = (c / kernel_w) % kernel_h;
    int c_im = c / kernel_h / kernel_w;

    const int hc0 = h_offset * dilation - pad;
    const int wc0 = w_offset * dilation - pad;
    for (int h = 0; h < height_col; ++h) {
      int h_pad = h * stride + hc0;

      const int row_offset = (c * height_col + h) * width_col;
      const int srow_offset = (c_im * height + h_pad) * width;


      for (int w = 0; w < width_col; ++w) {
        int w_pad = w * stride + wc0;
        if ((h_pad < height) && (w_pad < width))
          *(data_col + row_offset + w) = *(data_im + srow_offset + w_pad);
      }
    }
  }
}

and then multiplying the output with a custom matrix multiplication code.

void mat_mul(float *A, float *B, float *C, int M, int N, int K, bool has_bias) {
  int i, j, k;

  if (!has_bias)  init(C, M, N); //init() converts C into a 0 matrix
  
  # pragma omp parallel for private(i, j, k)
  for (i = 0; i < M; ++i) {
    for (k = 0; k < K; ++k) {
      float * ptr_c = &C[i * N];
      float * ptr_b = &B[k * N];
      float * ptr_a = &A[i * K + k];
      for (j = 0; j < N; ++j) {
        *(ptr_c+j) += *ptr_a * *(ptr_b + j);
      }
      
    }
  }
}

Thus, my convolution code is as follows:

void Conv2d(Tensor input, Tensor weight, Tensor bias, Tensor output, int stride, int pad, int dilation, bool has_bias) {
  int C = input.shape[0], H = input.shape[1], W = input.shape[2];
  int K = weight.shape[0], R = weight.shape[2], S = weight.shape[3];
  int OH = output.shape[1], OW = output.shape[2];
  CHECK_ERROR(OH == (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1, "Output height mismatch");
  CHECK_ERROR(OW == (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1, "Output width mismatch");
  CHECK_ERROR(weight.shape[1] == C && (!has_bias || bias.shape[0] == K) && output.shape[0] == K, "Channel size mismatch");

  float* col = (float *)malloc(sizeof(float) * (C * R * S * H * W));
  im2col_cpu(input.buf, C, H, W, R, S, pad, stride, dilation, col);
  mat_mul(weight.buf, col, output.buf, K, OH * OW, R * S * C, has_bias);

  free(col);

}

However, it turns out that my code is not the same as a regular convolution using the standard very slow algorithm in that the output from using the matrix multiplication method did not match the output from using the below method.

void Conv2d(Tensor input, Tensor weight, Tensor bias, Tensor output, int stride, int pad, int dilation, bool has_bias) {
  int C = input.shape[0], H = input.shape[1], W = input.shape[2];
  int K = weight.shape[0], R = weight.shape[2], S = weight.shape[3];
  int OH = output.shape[1], OW = output.shape[2];
  CHECK_ERROR(OH == (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1, "Output height mismatch");
  CHECK_ERROR(OW == (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1, "Output width mismatch");
  CHECK_ERROR(weight.shape[1] == C && (!has_bias || bias.shape[0] == K) && output.shape[0] == K, "Channel size mismatch");

  for (int k = 0; k < K; ++k) {
    for (int oh = 0; oh < OH; ++oh) {
      for (int ow = 0; ow < OW; ++ow) {
        float o = has_bias ? bias.buf[k] : 0;
        for (int c = 0; c < C; ++c) {
          for (int r = 0; r < R; ++r) {
            for (int s = 0; s < S; ++s) {
              int h = oh * stride - pad + r * dilation;
              int w = ow * stride - pad + s * dilation;
              if (h < 0 || h >= H || w < 0 || w >= W) continue;
              float i = input.buf[c * H * W + h * W + w];
              float f = weight.buf[k * C * R * S + c * R * S + r * S + s];
              o += i * f;
            }
          }
        }
        output.buf[k * OH * OW + oh * OW + ow] = o;
      }
    }
  }
}

Any ideas on why my matrix multiplication code is not working?

Steve · Accepted Answer

Oh I found out what the problem was. In my original code, I set the bias as

float o = has_bias ? bias.buf[k] : 0;

where k represents the kth filter out of K filters. However, in my mat_mul code I naively thought that *(ptr_c+j) += *ptr_a * *(ptr_b + j); would add the right amount of bias to the final output.

I have changed my code to say instead:

void mat_mul(float *A, float *B, float *C, Tensor bias, int M, int N, int K, bool has_bias) {
  # pragma omp parallel for
  for (int i = 0; i < M; ++i) {
    int h_offset = i * N;
    for (int j = 0; j < N; ++j) {
        C[h_offset + j] = has_bias ? bias.buf[i] : 0;
    }
  }
  
  int i, j, k;
  # pragma omp parallel for private(i, j, k)
  for (i = 0; i < M; ++i) {
    for (k = 0; k < K; ++k) {
      int ptr_c = i * N;
      int ptr_b = k * N;
      int ptr_a = i * K + k;
      for (j = 0; j < N; ++j) {
        C[ptr_c+j] += A[ptr_a] * B[ptr_b + j];
      }
      
    }
  }
}

which would allow me to add the same amount of bias as my original code.

Fast convolution algorithm have different outputs compared to regular convolution algorithm?

Answers (1)

Related Questions