SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
ref_convolution_op.cpp
#include "smaug/core/backend.h"
#include "smaug/operators/common.h"
#include "smaug/operators/convolution_op.h"
#include "smaug/operators/ref/ref_activation_fun_op.h"
#include "smaug/utility/debug_stream.h"

#ifdef __cplusplus
extern "C" {
#endif

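// Reference 3D convolution over NCHW-format inputs with valid padding: no
// zero-padding is applied, so for unit strides each output spatial dimension
// shrinks by the kernel size minus one.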
void ref_conv3d_nchw_valid_padding(float* input,
                                   float* kernels,
                                   float* result,
                                   int img_num,
                                   int img_chans,
                                   int img_rows,
                                   int img_cols,
                                   int img_pad,
                                   int k_num,
                                   int k_rows,
                                   int k_cols,
                                   int k_pad,
                                   int k_row_stride,
                                   int k_col_stride,
                                   int res_rows,
                                   int res_cols,
                                   int res_pad,
                                   activation_type act_function,
                                   activation_param_t act_params) {
    int input_size = img_num * img_chans * img_rows * (img_cols + img_pad);
    int kernel_size = k_num * img_chans * k_rows * (k_cols + k_pad);
    int result_size = img_num * k_num * res_rows * (res_cols + res_pad);
    dmaLoad(input, input, input_size * sizeof(float));
    dmaLoad(kernels, kernels, kernel_size * sizeof(float));

    // Convolution borders.
    const int start_i = 0;
    const int start_j = 0;
    const int end_i = img_rows - k_rows + 1;
    const int end_j = img_cols - k_cols + 1;

    ARRAY_4D(float, _input, input, img_chans, img_rows, img_cols + img_pad);
    ARRAY_4D(float, _kernels, kernels, img_chans, k_rows, k_cols + k_pad);
    ARRAY_4D(float, _result, result, k_num, res_rows, res_cols + res_pad);

    conv3d_input_num:
    for (int img = 0; img < img_num; img++) {
        conv3d_kern_num:
        for (int kern = 0; kern < k_num; kern++) {
            int out_i = 0;
            conv3d_input_rows:
            for (int i = start_i; i < end_i; i += k_row_stride) {
                int out_j = 0;
                conv3d_input_cols:
                for (int j = start_j; j < end_j; j += k_col_stride) {
                    float partial_sum = 0;
                    // Convolution loop over the kernel.
                    conv3d_kernel_height:
                    for (int d = 0; d < img_chans; d++) {
                        conv3d_kernel_rows:
                        for (int k = 0; k < k_rows; k++) {
                            conv3d_kernel_cols:
                            for (int l = 0; l < k_cols; l++) {
                                float img_val = _input[img][d][i + k][j + l];
                                float kern_val = _kernels[kern][d][k][l];
                                partial_sum += img_val * kern_val;
                            }
                        }
                    }
                    _result[img][kern][out_i][out_j] = partial_sum;
                    out_j++;
                }
                out_i++;
                out_j = 0;
            }
        }
    }
    if (act_function != NO_ACTIVATION) {
        activation_fun(result, result, result_size, act_function, act_params);
    }
    dmaStore(result, result, result_size * sizeof(float));
}

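// Reference 3D convolution over NCHW-format inputs with "same" padding: the
// input is treated as if zero-padded so that, for unit strides, the output
// has the same spatial dimensions as the input.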
void ref_conv3d_nchw_same_padding(float* input,
                                  float* kernels,
                                  float* result,
                                  int img_num,
                                  int img_chans,
                                  int img_rows,
                                  int img_cols,
                                  int img_pad,
                                  int k_num,
                                  int k_rows,
                                  int k_cols,
                                  int k_pad,
                                  int k_row_stride,
                                  int k_col_stride,
                                  int res_rows,
                                  int res_cols,
                                  int res_pad,
                                  activation_type act_function,
                                  activation_param_t act_params) {
    int input_size = img_num * img_chans * img_rows * (img_cols + img_pad);
    int kernel_size = k_num * img_chans * k_rows * (k_cols + k_pad);
    int result_size = img_num * k_num * res_rows * (res_cols + res_pad);
    dmaLoad(input, input, input_size * sizeof(float));
    dmaLoad(kernels, kernels, kernel_size * sizeof(float));

    const int total_row_pad = k_rows - 1;
    const int total_col_pad = k_cols - 1;
    const int left_pad = k_cols / 2;
    const int right_pad = total_col_pad - left_pad;
    const int top_pad = k_rows / 2;
    const int bottom_pad = total_row_pad - top_pad;

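    // With same padding, the output grid starts above and to the left of the
    // input; any read that falls outside the image contributes zero (see the
    // bounds checks in the inner loops).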
    // Convolution borders.
    const int start_i = -top_pad;
    const int start_j = -left_pad;
    const int end_i = img_rows + bottom_pad - k_rows + 1;
    const int end_j = img_cols + right_pad - k_cols + 1;

    ARRAY_4D(float, _input, input, img_chans, img_rows, img_cols + img_pad);
    ARRAY_4D(float, _kernels, kernels, img_chans, k_rows, k_cols + k_pad);
    ARRAY_4D(float, _result, result, k_num, res_rows, res_cols + res_pad);

    conv3d_input_num:
    for (int img = 0; img < img_num; img++) {
        conv3d_kern_num:
        for (int kern = 0; kern < k_num; kern++) {
            int out_i = 0;
            conv3d_input_rows:
            for (int i = start_i; i < end_i; i += k_row_stride) {
                int out_j = 0;
                conv3d_input_cols:
                for (int j = start_j; j < end_j; j += k_col_stride) {
                    float partial_sum = 0;

                    // Convolution loop over the kernel.
                    conv3d_kernel_height:
                    for (int d = 0; d < img_chans; d++) {
                        conv3d_kernel_rows:
                        for (int k = 0; k < k_rows; k++) {
                            bool rowInBounds =
                                    (i + k) >= 0 && (i + k) < img_rows;
                            conv3d_kernel_cols:
                            for (int l = 0; l < k_cols; l++) {
                                bool colInBounds =
                                        (j + l) >= 0 && (j + l) < img_cols;
                                float img_val = rowInBounds && colInBounds
                                                        ? _input[img][d][i + k][j + l]
                                                        : 0;
                                float kern_val = rowInBounds && colInBounds
                                                        ? _kernels[kern][d][k][l]
                                                        : 0;
                                partial_sum += img_val * kern_val;
                            }
                        }
                    }
                    _result[img][kern][out_i][out_j] = partial_sum;
                    out_j++;
                }
                out_i++;
                out_j = 0;
            }
        }
    }
    if (act_function != NO_ACTIVATION) {
        activation_fun(result, result, result_size, act_function, act_params);
    }
    dmaStore(result, result, result_size * sizeof(float));
}

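// Reference 3D convolution over NHWC-format (channels-last) inputs with
// valid padding.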
void ref_conv3d_nhwc_valid_padding(float* input,
                                   float* kernels,
                                   float* result,
                                   int img_num,
                                   int img_chans,
                                   int img_rows,
                                   int img_cols,
                                   int img_pad,
                                   int k_num,
                                   int k_rows,
                                   int k_cols,
                                   int k_pad,
                                   int k_row_stride,
                                   int k_col_stride,
                                   int res_rows,
                                   int res_cols,
                                   int res_pad,
                                   activation_type act_function,
                                   activation_param_t act_params) {
    int input_size = img_num * img_rows * img_cols * (img_chans + img_pad);
    int kernel_size = k_num * k_rows * k_cols * (img_chans + k_pad);
    int result_size = img_num * res_rows * res_cols * (k_num + res_pad);
    dmaLoad(input, input, input_size * sizeof(float));
    dmaLoad(kernels, kernels, kernel_size * sizeof(float));

    // Convolution borders.
    const int start_i = 0;
    const int start_j = 0;
    const int end_i = img_rows - k_rows + 1;
    const int end_j = img_cols - k_cols + 1;

    ARRAY_4D(float, _input, input, img_rows, img_cols, img_chans + img_pad);
    ARRAY_4D(float, _kernels, kernels, k_rows, k_cols, img_chans + k_pad);
    ARRAY_4D(float, _result, result, res_rows, res_cols, k_num + res_pad);

    conv3d_input_num:
    for (int img = 0; img < img_num; img++) {
        conv3d_kern_num:
        for (int kern = 0; kern < k_num; kern++) {
            int out_i = 0;
            conv3d_input_rows:
            for (int i = start_i; i < end_i; i += k_row_stride) {
                int out_j = 0;
                conv3d_input_cols:
                for (int j = start_j; j < end_j; j += k_col_stride) {
                    float partial_sum = 0;
                    // Convolution loop over the kernel.
                    conv3d_kernel_height:
                    for (int d = 0; d < img_chans; d++) {
                        conv3d_kernel_rows:
                        for (int k = 0; k < k_rows; k++) {
                            conv3d_kernel_cols:
                            for (int l = 0; l < k_cols; l++) {
                                float img_val = _input[img][i + k][j + l][d];
                                float kern_val = _kernels[kern][k][l][d];
                                partial_sum += img_val * kern_val;
                            }
                        }
                    }
                    _result[img][out_i][out_j][kern] = partial_sum;
                    out_j++;
                }
                out_i++;
                out_j = 0;
            }
        }
    }
    if (act_function != NO_ACTIVATION) {
        activation_fun(result, result, result_size, act_function, act_params);
    }
    dmaStore(result, result, result_size * sizeof(float));
}

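// Reference 3D convolution over NHWC-format (channels-last) inputs with
// "same" padding.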
void ref_conv3d_nhwc_same_padding(float* input,
                                  float* kernels,
                                  float* result,
                                  int img_num,
                                  int img_chans,
                                  int img_rows,
                                  int img_cols,
                                  int img_pad,
                                  int k_num,
                                  int k_rows,
                                  int k_cols,
                                  int k_pad,
                                  int k_row_stride,
                                  int k_col_stride,
                                  int res_rows,
                                  int res_cols,
                                  int res_pad,
                                  activation_type act_function,
                                  activation_param_t act_params) {
    int input_size = img_num * img_rows * img_cols * (img_chans + img_pad);
    int kernel_size = k_num * k_rows * k_cols * (img_chans + k_pad);
    int result_size = img_num * res_rows * res_cols * (k_num + res_pad);
    dmaLoad(input, input, input_size * sizeof(float));
    dmaLoad(kernels, kernels, kernel_size * sizeof(float));

    const int total_row_pad = k_rows - 1;
    const int total_col_pad = k_cols - 1;
    const int left_pad = k_cols / 2;
    const int right_pad = total_col_pad - left_pad;
    const int top_pad = k_rows / 2;
    const int bottom_pad = total_row_pad - top_pad;

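    // As in the NCHW variant, reads that fall outside the image contribute
    // zero to the partial sum.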
    // Convolution borders.
    const int start_i = -top_pad;
    const int start_j = -left_pad;
    const int end_i = img_rows + bottom_pad - k_rows + 1;
    const int end_j = img_cols + right_pad - k_cols + 1;

    ARRAY_4D(float, _input, input, img_rows, img_cols, img_chans + img_pad);
    ARRAY_4D(float, _kernels, kernels, k_rows, k_cols, img_chans + k_pad);
    ARRAY_4D(float, _result, result, res_rows, res_cols, k_num + res_pad);

    conv3d_input_num:
    for (int img = 0; img < img_num; img++) {
        conv3d_kern_num:
        for (int kern = 0; kern < k_num; kern++) {
            int out_i = 0;
            conv3d_input_rows:
            for (int i = start_i; i < end_i; i += k_row_stride) {
                int out_j = 0;
                conv3d_input_cols:
                for (int j = start_j; j < end_j; j += k_col_stride) {
                    float partial_sum = 0;

                    // Convolution loop over the kernel.
                    conv3d_kernel_height:
                    for (int d = 0; d < img_chans; d++) {
                        conv3d_kernel_rows:
                        for (int k = 0; k < k_rows; k++) {
                            bool rowInBounds =
                                    (i + k) >= 0 && (i + k) < img_rows;
                            conv3d_kernel_cols:
                            for (int l = 0; l < k_cols; l++) {
                                bool colInBounds =
                                        (j + l) >= 0 && (j + l) < img_cols;
                                float img_val = rowInBounds && colInBounds
                                                        ? _input[img][i + k][j + l][d]
                                                        : 0;
                                float kern_val = rowInBounds && colInBounds
                                                        ? _kernels[kern][k][l][d]
                                                        : 0;
                                partial_sum += img_val * kern_val;
                            }
                        }
                    }
                    _result[img][out_i][out_j][kern] = partial_sum;
                    out_j++;
                }
                out_i++;
                out_j = 0;
            }
        }
    }
    if (act_function != NO_ACTIVATION) {
        activation_fun(result, result, result_size, act_function, act_params);
    }
    dmaStore(result, result, result_size * sizeof(float));
}

#ifdef __cplusplus
}
#endif

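// The ReferenceBackend implementation of ConvolutionOp: it maps the operator's
// tensors onto the accelerator and dispatches to one of the four reference
// kernels above based on the input's data layout and the padding type.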
namespace smaug {

template <>
void ConvolutionOp<ReferenceBackend>::run() {
    auto input = getInput(Inputs);
    auto kernels = getInput(Kernels);
    auto output = getOutput(Outputs);
    const TensorShape& inputShape = input->getShape();
    const TensorShape& kernelShape = kernels->getShape();
    const TensorShape& outputShape = output->getShape();
    dout(2) << *kernels << "\n";

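    // Map each tensor's backing storage to the accelerator's named arrays so
    // that the dmaLoad/dmaStore calls in the kernels above operate on the
    // corresponding host buffers.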
    float* inputData = input->data<float>();
    float* kernelData = kernels->data<float>();
    float* outputData = output->data<float>();
    mapArrayToAccel(ref::kConvolutionHw, "input", inputData,
                    inputShape.storageSize() * sizeof(float));
    mapArrayToAccel(ref::kConvolutionHw, "kernels", kernelData,
                    kernelShape.storageSize() * sizeof(float));
    mapArrayToAccel(ref::kConvolutionHw, "result", outputData,
                    outputShape.storageSize() * sizeof(float));
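    // Pick the kernel variant that matches the input's layout (NCHW vs. NHWC)
    // and the operator's padding type, then adjust the dimension indices
    // accordingly before invoking it.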
    bool isNCHW = input->getShape().getLayout() == NCHW;
    auto func = isNCHW ? (paddingType == ValidPadding
                                  ? ref_conv3d_nchw_valid_padding
                                  : ref_conv3d_nchw_same_padding)
                       : (paddingType == ValidPadding
                                  ? ref_conv3d_nhwc_valid_padding
                                  : ref_conv3d_nhwc_same_padding);
    int rowIdx = isNCHW ? 2 : 1;
    int colIdx = isNCHW ? 3 : 2;
    int chanIdx = isNCHW ? 1 : 3;
    invokeKernel(ref::kConvolutionHw, func, inputData, kernelData, outputData,
                 inputShape[0], inputShape[chanIdx], inputShape[rowIdx],
                 inputShape[colIdx], inputShape.getPadding(3), kernelShape[0],
                 kernelShape[rowIdx], kernelShape[colIdx],
                 kernelShape.getPadding(3), getRowStride(), getColStride(),
                 outputShape[rowIdx], outputShape[colIdx],
                 outputShape.getPadding(3), actInfo.function, actInfo.params);
}

} // namespace smaug