1 #include "smaug/core/backend.h"
3 #include "smaug/operators/convolution_op.h"
4 #include "smaug/operators/ref/ref_activation_fun_op.h"
5 #include "smaug/utility/debug_stream.h"
// Fragment of a direct convolution over buffers indexed as
// [image][channel][row][col] (see the _input/_kernels/_result accesses below).
// NOTE(review): the function signature, the out_i/out_j bookkeeping and every
// closing brace are missing from this listing; the comments below describe
// only the lines that are visible.
//
// Element counts of the three buffers. The innermost (column) extent of each
// is widened by its pad amount (img_pad / k_pad / res_pad).
35 int input_size = img_num * img_chans * img_rows * (img_cols + img_pad);
36 int kernel_size = k_num * img_chans * k_rows * (k_cols + k_pad);
37 int result_size = img_num * k_num * res_rows * (res_cols + res_pad);
// Both loads pass the same pointer as first and second argument, plus the
// buffer size in bytes. Presumably this stages the data for the accelerator
// model -- confirm against the dmaLoad definition.
38 dmaLoad(input, input, input_size *
sizeof(
float));
39 dmaLoad(kernels, kernels, kernel_size *
sizeof(
float));
// Window origins run over [0, img_rows - k_rows] x [0, img_cols - k_cols], so
// i + k and j + l in the inner loops stay inside the image and no bounds
// checks are needed.
42 const int start_i = 0;
43 const int start_j = 0;
44 const int end_i = img_rows - k_rows + 1;
45 const int end_j = img_cols - k_cols + 1;
// ARRAY_4D is a project macro; judging by the four-index accesses below it
// presumably declares _input/_kernels/_result as 4-D views over the flat
// buffers using the three inner extents given here -- TODO confirm.
47 ARRAY_4D(
float, _input, input, img_chans, img_rows, img_cols + img_pad);
48 ARRAY_4D(
float, _kernels, kernels, img_chans, k_rows, k_cols + k_pad);
49 ARRAY_4D(
float, _result, result, k_num, res_rows, res_cols + res_pad);
// Loop nest: image -> output kernel -> window row origin -> window column
// origin. The origins advance by k_row_stride / k_col_stride.
52 for (
int img = 0; img < img_num; img++) {
54 for (
int kern = 0; kern < k_num; kern++) {
57 for (
int i = start_i; i < end_i; i += k_row_stride) {
60 for (
int j = start_j; j < end_j; j += k_col_stride) {
// Accumulator for one output element.
61 float partial_sum = 0;
// Reduce over input channels (d), kernel rows (k) and kernel columns (l).
64 for (
int d = 0; d < img_chans; d++) {
66 for (
int k = 0; k < k_rows; k++) {
68 for (
int l = 0; l < k_cols; l++) {
69 float img_val = _input[img][d][i + k][j + l];
70 float kern_val = _kernels[kern][d][k][l];
71 partial_sum += img_val * kern_val;
// out_i / out_j are not declared on any visible line; presumably they are
// output-coordinate counters maintained on lines missing from this listing.
75 _result[img][kern][out_i][out_j] = partial_sum;
// The activation is applied in place over all result_size elements, and only
// when one is requested.
83 if (act_function != NO_ACTIVATION) {
84 activation_fun(result, result, result_size, act_function, act_params);
// Mirrors the dmaLoad calls: same pointer twice, size in bytes.
86 dmaStore(result, result, result_size *
sizeof(
float));
// Fragment of a direct convolution over buffers indexed as
// [image][channel][row][col], where the window is allowed to overhang the
// image border and out-of-bounds taps are masked by rowInBounds/colInBounds.
// NOTE(review): the function signature, the declarations of rowInBounds /
// colInBounds / out_i / out_j, the else-branches of the two ternaries and
// every closing brace are missing from this listing; the comments below
// describe only the lines that are visible.
//
// Element counts of the three buffers. The innermost (column) extent of each
// is widened by its pad amount (img_pad / k_pad / res_pad).
113 int input_size = img_num * img_chans * img_rows * (img_cols + img_pad);
114 int kernel_size = k_num * img_chans * k_rows * (k_cols + k_pad);
115 int result_size = img_num * k_num * res_rows * (res_cols + res_pad);
// Both loads pass the same pointer as first and second argument, plus the
// buffer size in bytes. Presumably this stages the data for the accelerator
// model -- confirm against the dmaLoad definition.
116 dmaLoad(input, input, input_size *
sizeof(
float));
117 dmaLoad(kernels, kernels, kernel_size *
sizeof(
float));
// Total border added per dimension is (kernel extent - 1), split into a
// leading and a trailing part. left/right pad offset the column index j and
// so must be derived from the kernel width (k_cols); top/bottom pad offset
// the row index i and so must be derived from the kernel height (k_rows).
// Using the other dimension here would shift the window for non-square
// kernels; for square kernels the values are the same either way.
119 const int total_row_pad = k_rows - 1;
120 const int total_col_pad = k_cols - 1;
121 const int left_pad = k_cols / 2;
122 const int right_pad = total_col_pad - left_pad;
123 const int top_pad = k_rows / 2;
124 const int bottom_pad = total_row_pad - top_pad;
// Window origins start in the (virtual) leading border and stop so that the
// last window ends in the trailing border.
127 const int start_i = -top_pad;
128 const int start_j = -left_pad;
129 const int end_i = img_rows + bottom_pad - k_rows + 1;
130 const int end_j = img_cols + right_pad - k_cols + 1;
// ARRAY_4D is a project macro; judging by the four-index accesses below it
// presumably declares _input/_kernels/_result as 4-D views over the flat
// buffers using the three inner extents given here -- TODO confirm.
132 ARRAY_4D(
float, _input, input, img_chans, img_rows, img_cols + img_pad);
133 ARRAY_4D(
float, _kernels, kernels, img_chans, k_rows, k_cols + k_pad);
134 ARRAY_4D(
float, _result, result, k_num, res_rows, res_cols + res_pad);
// Loop nest: image -> output kernel -> window row origin -> window column
// origin. The origins advance by k_row_stride / k_col_stride.
137 for (
int img = 0; img < img_num; img++) {
139 for (
int kern = 0; kern < k_num; kern++) {
142 for (
int i = start_i; i < end_i; i += k_row_stride) {
145 for (
int j = start_j; j < end_j; j += k_col_stride) {
// Accumulator for one output element.
146 float partial_sum = 0;
148 conv3d_kernel_height:
// Reduce over input channels (d), kernel rows (k) and kernel columns (l).
150 for (
int d = 0; d < img_chans; d++) {
152 for (
int k = 0; k < k_rows; k++) {
// Row bounds test for this tap. The declaration it initialises (presumably
// rowInBounds, used below) is on a line missing from this listing.
154 (i + k) >= 0 && (i + k) < img_rows;
156 for (
int l = 0; l < k_cols; l++) {
// Column bounds test for this tap (presumably initialises colInBounds).
158 (j + l) >= 0 && (j + l) < img_cols;
// Memory is only read when the tap lies inside the image. The fallback value
// of each ternary is on a missing line -- presumably zero; TODO confirm.
159 float img_val = rowInBounds && colInBounds
160 ? _input[img][d][i + k][j + l]
162 float kern_val = rowInBounds && colInBounds
163 ? _kernels[kern][d][k][l]
165 partial_sum += img_val * kern_val;
// out_i / out_j are not declared on any visible line; presumably they are
// output-coordinate counters maintained on lines missing from this listing.
169 _result[img][kern][out_i][out_j] = partial_sum;
// The activation is applied in place over all result_size elements, and only
// when one is requested.
177 if (act_function != NO_ACTIVATION) {
178 activation_fun(result, result, result_size, act_function, act_params);
// Mirrors the dmaLoad calls: same pointer twice, size in bytes.
180 dmaStore(result, result, result_size *
sizeof(
float));
// Fragment of a direct convolution over buffers indexed as
// [image][row][col][channel] (see the _input/_kernels/_result accesses below).
// NOTE(review): the function signature, the out_i/out_j bookkeeping and every
// closing brace are missing from this listing; the comments below describe
// only the lines that are visible.
//
// Element counts of the three buffers. The innermost (channel) extent of each
// is widened by its pad amount (img_pad / k_pad / res_pad).
207 int input_size = img_num * img_rows * img_cols * (img_chans + img_pad);
208 int kernel_size = k_num * k_rows * k_cols * (img_chans + k_pad);
209 int result_size = img_num * res_rows * res_cols * (k_num + res_pad);
// Both loads pass the same pointer as first and second argument, plus the
// buffer size in bytes. Presumably this stages the data for the accelerator
// model -- confirm against the dmaLoad definition.
210 dmaLoad(input, input, input_size *
sizeof(
float));
211 dmaLoad(kernels, kernels, kernel_size *
sizeof(
float));
// Window origins run over [0, img_rows - k_rows] x [0, img_cols - k_cols], so
// i + k and j + l in the inner loops stay inside the image and no bounds
// checks are needed.
214 const int start_i = 0;
215 const int start_j = 0;
216 const int end_i = img_rows - k_rows + 1;
217 const int end_j = img_cols - k_cols + 1;
// ARRAY_4D is a project macro; judging by the four-index accesses below it
// presumably declares _input/_kernels/_result as 4-D views over the flat
// buffers using the three inner extents given here -- TODO confirm.
219 ARRAY_4D(
float, _input, input, img_rows, img_cols, img_chans + img_pad);
220 ARRAY_4D(
float, _kernels, kernels, k_rows, k_cols, img_chans + k_pad);
221 ARRAY_4D(
float, _result, result, res_rows, res_cols, k_num + res_pad);
// Loop nest: image -> output kernel -> window row origin -> window column
// origin. The origins advance by k_row_stride / k_col_stride.
224 for (
int img = 0; img < img_num; img++) {
226 for (
int kern = 0; kern < k_num; kern++) {
229 for (
int i = start_i; i < end_i; i += k_row_stride) {
232 for (
int j = start_j; j < end_j; j += k_col_stride) {
// Accumulator for one output element.
233 float partial_sum = 0;
234 conv3d_kernel_height:
// Reduce over input channels (d), kernel rows (k) and kernel columns (l).
// The channel loop is outermost even though channel is the innermost index
// of _input and _kernels.
236 for (
int d = 0; d < img_chans; d++) {
238 for (
int k = 0; k < k_rows; k++) {
240 for (
int l = 0; l < k_cols; l++) {
241 float img_val = _input[img][i + k][j + l][d];
242 float kern_val = _kernels[kern][k][l][d];
243 partial_sum += img_val * kern_val;
// out_i / out_j are not declared on any visible line; presumably they are
// output-coordinate counters maintained on lines missing from this listing.
247 _result[img][out_i][out_j][kern] = partial_sum;
// The activation is applied in place over all result_size elements, and only
// when one is requested.
255 if (act_function != NO_ACTIVATION) {
256 activation_fun(result, result, result_size, act_function, act_params);
// Mirrors the dmaLoad calls: same pointer twice, size in bytes.
258 dmaStore(result, result, result_size *
sizeof(
float));
// Fragment of a direct convolution over buffers indexed as
// [image][row][col][channel], where the window is allowed to overhang the
// image border and out-of-bounds taps are masked by rowInBounds/colInBounds.
// NOTE(review): the function signature, the declarations of rowInBounds /
// colInBounds / out_i / out_j, the else-branches of the two ternaries and
// every closing brace are missing from this listing; the comments below
// describe only the lines that are visible.
//
// Element counts of the three buffers. The innermost (channel) extent of each
// is widened by its pad amount (img_pad / k_pad / res_pad).
285 int input_size = img_num * img_rows * img_cols * (img_chans + img_pad);
286 int kernel_size = k_num * k_rows * k_cols * (img_chans + k_pad);
287 int result_size = img_num * res_rows * res_cols * (k_num + res_pad);
// Both loads pass the same pointer as first and second argument, plus the
// buffer size in bytes. Presumably this stages the data for the accelerator
// model -- confirm against the dmaLoad definition.
288 dmaLoad(input, input, input_size *
sizeof(
float));
289 dmaLoad(kernels, kernels, kernel_size *
sizeof(
float));
// Total border added per dimension is (kernel extent - 1), split into a
// leading and a trailing part. left/right pad offset the column index j and
// so must be derived from the kernel width (k_cols); top/bottom pad offset
// the row index i and so must be derived from the kernel height (k_rows).
// Using the other dimension here would shift the window for non-square
// kernels; for square kernels the values are the same either way.
291 const int total_row_pad = k_rows - 1;
292 const int total_col_pad = k_cols - 1;
293 const int left_pad = k_cols / 2;
294 const int right_pad = total_col_pad - left_pad;
295 const int top_pad = k_rows / 2;
296 const int bottom_pad = total_row_pad - top_pad;
// Window origins start in the (virtual) leading border and stop so that the
// last window ends in the trailing border.
299 const int start_i = -top_pad;
300 const int start_j = -left_pad;
301 const int end_i = img_rows + bottom_pad - k_rows + 1;
302 const int end_j = img_cols + right_pad - k_cols + 1;
// ARRAY_4D is a project macro; judging by the four-index accesses below it
// presumably declares _input/_kernels/_result as 4-D views over the flat
// buffers using the three inner extents given here -- TODO confirm.
304 ARRAY_4D(
float, _input, input, img_rows, img_cols, img_chans + img_pad);
305 ARRAY_4D(
float, _kernels, kernels, k_rows, k_cols, img_chans + k_pad);
306 ARRAY_4D(
float, _result, result, res_rows, res_cols, k_num + res_pad);
// Loop nest: image -> output kernel -> window row origin -> window column
// origin. The origins advance by k_row_stride / k_col_stride.
309 for (
int img = 0; img < img_num; img++) {
311 for (
int kern = 0; kern < k_num; kern++) {
314 for (
int i = start_i; i < end_i; i += k_row_stride) {
317 for (
int j = start_j; j < end_j; j += k_col_stride) {
// Accumulator for one output element.
318 float partial_sum = 0;
320 conv3d_kernel_height:
// Reduce over input channels (d), kernel rows (k) and kernel columns (l).
322 for (
int d = 0; d < img_chans; d++) {
324 for (
int k = 0; k < k_rows; k++) {
// Row bounds test for this tap. The declaration it initialises (presumably
// rowInBounds, used below) is on a line missing from this listing.
326 (i + k) >= 0 && (i + k) < img_rows;
328 for (
int l = 0; l < k_cols; l++) {
// Column bounds test for this tap (presumably initialises colInBounds).
330 (j + l) >= 0 && (j + l) < img_cols;
// Memory is only read when the tap lies inside the image. The fallback value
// of each ternary is on a missing line -- presumably zero; TODO confirm.
331 float img_val = rowInBounds && colInBounds
332 ? _input[img][i + k][j + l][d]
334 float kern_val = rowInBounds && colInBounds
335 ? _kernels[kern][k][l][d]
337 partial_sum += img_val * kern_val;
// out_i / out_j are not declared on any visible line; presumably they are
// output-coordinate counters maintained on lines missing from this listing.
341 _result[img][out_i][out_j][kern] = partial_sum;
// The activation is applied in place over all result_size elements, and only
// when one is requested.
349 if (act_function != NO_ACTIVATION) {
350 activation_fun(result, result, result_size, act_function, act_params);
// Mirrors the dmaLoad calls: same pointer twice, size in bytes.
352 dmaStore(result, result, result_size *
sizeof(
float));
// Runs the convolution for the reference backend: fetches the input, kernel
// and output tensors, picks a kernel function from the input layout and the
// padding type, and dispatches it through invokeKernel.
// NOTE(review): several lines of this function (the calls that consume the
// byte counts below, the branches of the `func` selection, and the closing
// brace) are missing from this listing.
362 void ConvolutionOp<ReferenceBackend>::run() {
363 auto input = getInput(Inputs);
364 auto kernels = getInput(Kernels);
365 auto output = getOutput(Outputs);
366 const TensorShape& inputShape = input->getShape();
367 const TensorShape& kernelShape = kernels->getShape();
368 const TensorShape& outputShape = output->getShape();
// Debug output of the kernel tensor, streamed to dout(2).
369 dout(2) << *kernels <<
"\n";
// Raw float pointers to the tensor storage, passed to the kernel below.
371 float* inputData = input->data<
float>();
372 float* kernelData = kernels->data<
float>();
373 float* outputData = output->data<
float>();
// Byte size of each tensor's storage. The calls these expressions are
// arguments to are on lines missing from this listing -- presumably they
// register each buffer with the accelerator; TODO confirm.
375 inputShape.storageSize() *
sizeof(
float));
377 kernelShape.storageSize() *
sizeof(
float));
379 outputShape.storageSize() *
sizeof(
float));
// Kernel selection: first by layout (NCHW or not), then by padding type
// (ValidPadding or not). The function names chosen in each branch are on
// lines missing from this listing.
380 bool isNCHW = input->getShape().getLayout() == NCHW;
381 auto func = isNCHW ? (paddingType == ValidPadding
384 : (paddingType == ValidPadding
// Positions of the row / column / channel extents within a shape: (2, 3, 1)
// when the layout is NCHW, (1, 2, 3) otherwise.
387 int rowIdx = isNCHW ? 2 : 1;
388 int colIdx = isNCHW ? 3 : 2;
389 int chanIdx = isNCHW ? 1 : 3;
// Dispatch. Padding is always taken from dimension 3 (the innermost one) of
// each shape, for both layouts. The number of kernels is kernelShape[0] and
// the number of images is inputShape[0].
390 invokeKernel(ref::kConvolutionHw, func, inputData, kernelData, outputData,
391 inputShape[0], inputShape[chanIdx], inputShape[rowIdx],
392 inputShape[colIdx], inputShape.getPadding(3), kernelShape[0],
393 kernelShape[rowIdx], kernelShape[colIdx],
394 kernelShape.getPadding(3), getRowStride(), getColStride(),
395 outputShape[rowIdx], outputShape[colIdx],
396 outputShape.getPadding(3), actInfo.function, actInfo.params);