1 #include "smaug/core/backend.h"
3 #include "smaug/operators/depthwise_convolution_op.h"
// Element counts of the three buffers. Every row is widened by its column
// padding (img_pad / k_pad / res_pad). kernel_size has no img_num factor and
// uses img_chans as its outermost extent, i.e. one k_rows x k_cols filter per
// channel.
31 int input_size = img_num * img_chans * img_rows * (img_cols + img_pad);
32 int kernel_size = img_chans * k_rows * (k_cols + k_pad);
33 int result_size = img_num * img_chans * res_rows * (res_cols + res_pad);
// dmaLoad is declared outside this view; both calls pass the same pointer as
// the first two arguments and a byte count (element count * sizeof(float)).
34 dmaLoad(input, input, input_size *
sizeof(
float));
35 dmaLoad(kernels, kernels, kernel_size *
sizeof(
float));
// Window origin range without padding: with i < end_i and k < k_rows,
// (i + k) never reaches img_rows; likewise (j + l) never reaches img_cols.
38 const int start_i = 0;
39 const int start_j = 0;
40 const int end_i = img_rows - k_rows + 1;
41 const int end_j = img_cols - k_cols + 1;
// ARRAY_4D / ARRAY_3D are macros defined elsewhere. NOTE(review): they appear
// to declare the multi-dimensional views _input, _kernels and _result over the
// flat buffers, with the padded column count as the innermost extent - confirm
// against the macro definitions.
43 ARRAY_4D(
float, _input, input, img_chans, img_rows, img_cols + img_pad);
44 ARRAY_3D(
float, _kernels, kernels, k_rows, k_cols + k_pad);
45 ARRAY_4D(
float, _result, result, img_chans, res_rows, res_cols + res_pad);
// Loop over every image in the batch.
48 for (
int img = 0; img < img_num; img++) {
// Loop over channels; `kern` selects both the input channel and its filter.
50 for (
int kern = 0; kern < img_chans; kern++) {
// Slide the window origin over rows, stepping by the row stride.
53 for (
int i = start_i; i < end_i; i += k_row_stride) {
// Slide the window origin over columns, stepping by the column stride.
56 for (
int j = start_j; j < end_j; j += k_col_stride) {
// Accumulator for the single output element produced by this window.
57 float partial_sum = 0;
// Walk the filter rows.
59 for (
int k = 0; k < k_rows; k++) {
// Walk the filter columns.
61 for (
int l = 0; l < k_cols; l++) {
62 int img_val = _input[img][kern][i + k][j + l];
63 int kern_val = _kernels[kern][k][l];
64 partial_sum += img_val * kern_val;
// Write the accumulated sum for the current window to its output position.
// NOTE(review): out_i / out_j are not declared or advanced in the visible
// lines - confirm where they are maintained.
67 _result[img][kern][out_i][out_j] = partial_sum;
// dmaStore is declared outside this view; it is passed the result pointer
// twice and the result size in bytes (padded element count * sizeof(float)).
75 dmaStore(result, result, result_size *
sizeof(
float));
99 int input_size = img_num * img_chans * img_rows * (img_cols + img_pad);
100 int kernel_size = img_chans * k_rows * (k_cols + k_pad);
101 int result_size = img_num * img_chans * res_rows * (res_cols + res_pad);
102 dmaLoad(input, input, input_size *
sizeof(
float));
103 dmaLoad(kernels, kernels, kernel_size *
sizeof(
float));
105 const int total_row_pad = k_rows - 1;
106 const int total_col_pad = k_cols - 1;
107 const int left_pad = k_rows / 2;
108 const int right_pad = total_col_pad - left_pad;
109 const int top_pad = k_cols / 2;
110 const int bottom_pad = total_row_pad - top_pad;
113 const int start_i = -top_pad;
114 const int start_j = -left_pad;
115 const int end_i = img_rows - bottom_pad;
116 const int end_j = img_cols - right_pad;
118 ARRAY_4D(
float, _input, input, img_chans, img_rows, img_cols + img_pad);
119 ARRAY_3D(
float, _kernels, kernels, k_rows, k_cols + k_pad);
120 ARRAY_4D(
float, _result, result, img_chans, res_rows, res_cols + res_pad);
// Loop over every image in the batch.
123 for (
int img = 0; img < img_num; img++) {
// Loop over channels; `kern` selects both the input channel and its filter.
125 for (
int kern = 0; kern < img_chans; kern++) {
// Window origin over rows. start_i is a negated pad, so i may be negative.
128 for (
int i = start_i; i < end_i; i += k_row_stride) {
// Window origin over columns. start_j is a negated pad, so j may be negative.
131 for (
int j = start_j; j < end_j; j += k_col_stride) {
// Accumulator for the single output element produced by this window.
132 float partial_sum = 0;
// Walk the filter rows.
134 for (
int k = 0; k < k_rows; k++) {
136 (i + k) >= 0 && (i + k) < img_rows;
// Walk the filter columns.
138 for (
int l = 0; l < k_cols; l++) {
140 (j + l) >= 0 && (j + l) < img_cols;
// Only read the image and the filter when the tap lies inside the image.
// NOTE(review): the fallback arm of each ternary is not visible here -
// presumably 0, which would make out-of-range taps contribute nothing.
141 float img_val = rowInBounds && colInBounds
142 ? _input[img][kern][i + k][j + l]
144 float kern_val = rowInBounds && colInBounds
145 ? _kernels[kern][k][l]
147 partial_sum += img_val * kern_val;
// Write the accumulated sum for the current window to its output position.
// NOTE(review): out_i / out_j are not declared or advanced in the visible
// lines - confirm where they are maintained.
150 _result[img][kern][out_i][out_j] = partial_sum;
// dmaStore is declared outside this view; it is passed the result pointer
// twice and the result size in bytes.
158 dmaStore(result, result, result_size *
sizeof(
float));
// Reference-backend entry point for the depthwise convolution operator:
// fetches the input, kernel and output tensors, checks their layouts, and
// hands raw float buffers plus shape information to invokeKernel.
168 void DepthwiseConvolutionOp<ReferenceBackend>::run() {
169 auto input = getInput(Inputs);
170 auto kernels = getInput(Kernels);
171 auto output = getOutput(Outputs);
172 const TensorShape& inputShape = input->getShape();
173 const TensorShape& kernelShape = kernels->getShape();
174 const TensorShape& outputShape = output->getShape();
// All three tensors must be NCHW. These are asserts, so the checks disappear
// when the build defines NDEBUG.
175 assert(inputShape.getLayout() == DataLayout::NCHW);
176 assert(kernelShape.getLayout() == DataLayout::NCHW);
177 assert(outputShape.getLayout() == DataLayout::NCHW);
// Raw float views of the tensor storage, passed to the kernel below.
179 float* inputData = input->data<
float>();
180 float* kernelData = kernels->data<
float>();
181 float* outputData = output->data<
float>();
183 inputShape.storageSize() *
sizeof(
float));
185 kernelShape.storageSize() *
sizeof(
float));
187 outputShape.storageSize() *
sizeof(
float));
// NOTE(review): the three fragments above are the tails of calls whose first
// lines are not visible; each passes a tensor's storage size in bytes.
// `func` is also defined outside the visible lines - presumably it selects the
// valid- or same-padding kernel; confirm.
// Argument order after the buffers: the four input dims and the input's
// dim-3 padding, kernel dims 2 and 3 and the kernel's dim-3 padding, row and
// column stride, then output dims 2 and 3 and the output's dim-3 padding.
190 invokeKernel(ref::kConvolutionHw, func, inputData, kernelData, outputData,
191 inputShape[0], inputShape[1], inputShape[2], inputShape[3],
192 inputShape.getPadding(3), kernelShape[2], kernelShape[3],
193 kernelShape.getPadding(3), getRowStride(), getColStride(),
194 outputShape[2], outputShape[3], outputShape.getPadding(3));