SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
ref_depthwise_convolution_op.cpp
1 #include "smaug/core/backend.h"
2 #include "smaug/operators/common.h"
3 #include "smaug/operators/depthwise_convolution_op.h"
4 
5 #ifdef __cplusplus
6 extern "C" {
7 #endif
8 
15 void ref_conv2d_nchw_valid_padding(float* input,
16                                    float* kernels,
17                                    float* result,
18                                    int img_num,
19                                    int img_chans,
20                                    int img_rows,
21                                    int img_cols,
22                                    int img_pad,
23                                    int k_rows,
24                                    int k_cols,
25                                    int k_pad,
26                                    int k_row_stride,
27                                    int k_col_stride,
28                                    int res_rows,
29                                    int res_cols,
30                                    int res_pad) {
31     int input_size = img_num * img_chans * img_rows * (img_cols + img_pad);
32     int kernel_size = img_chans * k_rows * (k_cols + k_pad);
33     int result_size = img_num * img_chans * res_rows * (res_cols + res_pad);
34     dmaLoad(input, input, input_size * sizeof(float));
35     dmaLoad(kernels, kernels, kernel_size * sizeof(float));
36 
37     // Convolution borders.
38     const int start_i = 0;
39     const int start_j = 0;
40     const int end_i = img_rows - k_rows + 1;
41     const int end_j = img_cols - k_cols + 1;
42 
43     ARRAY_4D(float, _input, input, img_chans, img_rows, img_cols + img_pad);
44     ARRAY_3D(float, _kernels, kernels, k_rows, k_cols + k_pad);
45     ARRAY_4D(float, _result, result, img_chans, res_rows, res_cols + res_pad);
46 
47     conv2d_input_num:
48     for (int img = 0; img < img_num; img++) {
49         conv2d_kern_num:
50         for (int kern = 0; kern < img_chans; kern++) {
51             int out_i = 0;
52             conv2d_input_rows:
53             for (int i = start_i; i < end_i; i += k_row_stride) {
54                 int out_j = 0;
55                 conv2d_input_cols:
56                 for (int j = start_j; j < end_j; j += k_col_stride) {
57                     float partial_sum = 0;
58                     conv2d_kernel_rows:
59                     for (int k = 0; k < k_rows; k++) {
60                         conv2d_kernel_cols:
61                         for (int l = 0; l < k_cols; l++) {
62                             float img_val = _input[img][kern][i + k][j + l];
63                             float kern_val = _kernels[kern][k][l];
64                             partial_sum += img_val * kern_val;
65                         }
66                     }
67                     _result[img][kern][out_i][out_j] = partial_sum;
68                     out_j++;
69                 }
70                 out_i++;
71                 out_j = 0;
72             }
73         }
74     }
75     dmaStore(result, result, result_size * sizeof(float));
76 }
77 
83 void ref_conv2d_nchw_same_padding(float* input,
84                                   float* kernels,
85                                   float* result,
86                                   int img_num,
87                                   int img_chans,
88                                   int img_rows,
89                                   int img_cols,
90                                   int img_pad,
91                                   int k_rows,
92                                   int k_cols,
93                                   int k_pad,
94                                   int k_row_stride,
95                                   int k_col_stride,
96                                   int res_rows,
97                                   int res_cols,
98                                   int res_pad) {
99     int input_size = img_num * img_chans * img_rows * (img_cols + img_pad);
100     int kernel_size = img_chans * k_rows * (k_cols + k_pad);
101     int result_size = img_num * img_chans * res_rows * (res_cols + res_pad);
102     dmaLoad(input, input, input_size * sizeof(float));
103     dmaLoad(kernels, kernels, kernel_size * sizeof(float));
104 
105     const int total_row_pad = k_rows - 1;
106     const int total_col_pad = k_cols - 1;
107     const int left_pad = k_cols / 2;
108     const int right_pad = total_col_pad - left_pad;
109     const int top_pad = k_rows / 2;
110     const int bottom_pad = total_row_pad - top_pad;
111 
112     // Convolution borders.
113     const int start_i = -top_pad;
114     const int start_j = -left_pad;
115     const int end_i = img_rows - bottom_pad;
116     const int end_j = img_cols - right_pad;
117 
118     ARRAY_4D(float, _input, input, img_chans, img_rows, img_cols + img_pad);
119     ARRAY_3D(float, _kernels, kernels, k_rows, k_cols + k_pad);
120     ARRAY_4D(float, _result, result, img_chans, res_rows, res_cols + res_pad);
121 
122     conv2d_input_num:
123     for (int img = 0; img < img_num; img++) {
124         conv2d_kern_num:
125         for (int kern = 0; kern < img_chans; kern++) {
126             int out_i = 0;
127             conv2d_input_rows:
128             for (int i = start_i; i < end_i; i += k_row_stride) {
129                 int out_j = 0;
130                 conv2d_input_cols:
131                 for (int j = start_j; j < end_j; j += k_col_stride) {
132                     float partial_sum = 0;
133                     conv2d_kernel_rows:
134                     for (int k = 0; k < k_rows; k++) {
135                         bool rowInBounds =
136                                 (i + k) >= 0 && (i + k) < img_rows;
137                         conv2d_kernel_cols:
138                         for (int l = 0; l < k_cols; l++) {
139                             bool colInBounds =
140                                     (j + l) >= 0 && (j + l) < img_cols;
141                             float img_val = rowInBounds && colInBounds
142                                                     ? _input[img][kern][i + k][j + l]
143                                                     : 0;
144                             float kern_val = rowInBounds && colInBounds
145                                                     ? _kernels[kern][k][l]
146                                                     : 0;
147                             partial_sum += img_val * kern_val;
148                         }
149                     }
150                     _result[img][kern][out_i][out_j] = partial_sum;
151                     out_j++;
152                 }
153                 out_i++;
154                 out_j = 0;
155             }
156         }
157     }
158     dmaStore(result, result, result_size * sizeof(float));
159 }
160 
161 #ifdef __cplusplus
162 }
163 #endif
164 
165 namespace smaug {
166 
167 template <>
168 void DepthwiseConvolutionOp<ReferenceBackend>::run() {
169     auto input = getInput(Inputs);
170     auto kernels = getInput(Kernels);
171     auto output = getOutput(Outputs);
172     const TensorShape& inputShape = input->getShape();
173     const TensorShape& kernelShape = kernels->getShape();
174     const TensorShape& outputShape = output->getShape();
175     assert(inputShape.getLayout() == DataLayout::NCHW);
176     assert(kernelShape.getLayout() == DataLayout::NCHW);
177     assert(outputShape.getLayout() == DataLayout::NCHW);
178 
179     float* inputData = input->data<float>();
180     float* kernelData = kernels->data<float>();
181     float* outputData = output->data<float>();
182     mapArrayToAccel(ref::kConvolutionHw, "input", inputData,
183                     inputShape.storageSize() * sizeof(float));
184     mapArrayToAccel(ref::kConvolutionHw, "kernels", kernelData,
185                     kernelShape.storageSize() * sizeof(float));
186     mapArrayToAccel(ref::kConvolutionHw, "result", outputData,
187                     outputShape.storageSize() * sizeof(float));
188     auto func = paddingType == ValidPadding ? ref_conv2d_nchw_valid_padding
189                                             : ref_conv2d_nchw_same_padding;
190     invokeKernel(ref::kConvolutionHw, func, inputData, kernelData, outputData,
191                  inputShape[0], inputShape[1], inputShape[2], inputShape[3],
192                  inputShape.getPadding(3), kernelShape[2], kernelShape[3],
193                  kernelShape.getPadding(3), getRowStride(), getColStride(),
194                  outputShape[2], outputShape[3], outputShape.getPadding(3));
195 }
196 
197 } // namespace smaug
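
The two kernels derive their result dimensions from the loop bounds rather than computing them explicitly. As a quick reference, and as a hedged sketch only (the helper functions below are illustrative and not part of SMAUG; the same-padding case assumes odd kernel sizes), the bounds imply:

// Valid padding: i runs from 0 through img_rows - k_rows in steps of the
// stride, so each spatial axis yields (in_dim - k_dim) / stride + 1 outputs.
static inline int valid_out_dim(int in_dim, int k_dim, int stride) {
    return (in_dim - k_dim) / stride + 1;
}

// Same padding with an odd kernel: i runs from -pad through in_dim - pad - 1,
// so each axis yields ceil(in_dim / stride) outputs (in_dim when stride == 1).
static inline int same_out_dim(int in_dim, int stride) {
    return (in_dim + stride - 1) / stride;
}

For example, a 28x28 input channel with a 3x3 kernel and stride 1 produces a 26x26 result under valid padding and a 28x28 result under same padding.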
Cross references

ref_conv2d_nchw_same_padding
    Definition: ref_depthwise_convolution_op.cpp:83

ref_conv2d_nchw_valid_padding
    Definition: ref_depthwise_convolution_op.cpp:15

smaug
    The smaug namespace is the parent namespace of all C++ code in SMAUG.
    Definition: backend.cpp:38

common.h
    Utilities for writing and invoking Aladdin kernels from Operators.

smaug::mapArrayToAccel
    void mapArrayToAccel(unsigned reqCode, const char *arrayName, void *baseAddr, size_t size)
    Maps an array of data to the accelerator.
    Definition: common.cpp:12

smaug::invokeKernel
    void invokeKernel(int accelIdx, unsigned reqCode, const Kernel &kernel, Args &&... args)
    The generic blocking interface for all accelerator kernel functions.
    Definition: common.h:72
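
One further note on indexing: the kernels access their flat buffers through Aladdin's ARRAY_4D and ARRAY_3D macros, which reinterpret a pointer as a multi-dimensional C array with the listed inner dimensions. As a minimal sketch, assuming the row-major layout those views imply, the access _input[img][kern][r][c] in the kernels above corresponds to the flat offset computed by this hypothetical helper:

// Sketch only, not part of SMAUG: row-major NCHW offset, where padded_cols is
// img_cols + img_pad, the innermost dimension passed to ARRAY_4D above.
static inline int nchw_offset(int img, int chan, int row, int col,
                              int img_chans, int img_rows, int padded_cols) {
    return ((img * img_chans + chan) * img_rows + row) * padded_cols + col;
}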