SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
ref_pooling_op.cpp
1 #include <utility>
2 
3 #include "smaug/core/backend.h"
5 #include "smaug/operators/pooling_op.h"
6 
7 #ifdef __cplusplus
8 extern "C" {
9 #endif
10 
15 void ref_max_pooling_nchw_treemax(float* input,
16  float* result,
17  int img_num,
18  int img_chans,
19  int img_rows,
20  int img_cols,
21  int img_pad,
22  int res_rows,
23  int res_cols,
24  int res_pad,
25  int pool_row_size,
26  int pool_col_size,
27  int pool_row_stride,
28  int pool_col_stride) {
29  int total_pool_size = pool_row_size * pool_col_size;
30  int end_row = img_rows - pool_row_size + 1;
31  int end_col = img_cols - pool_col_size + 1;
32  float elems[total_pool_size];
33  int elem_idx;
34 
35  ARRAY_4D(float, _input, input, img_chans, img_rows, img_cols + img_pad);
36  ARRAY_4D(float, _result, result, img_chans, res_rows, res_cols + res_pad);
37 
38  maxpool_input_num:
39  for (int img = 0; img < img_num; img++) {
40  maxpool_input_height:
41  for (int h = 0; h < img_chans; h++) {
42  int oi = 0;
43  maxpool_input_rows:
44  for (int i = 0; i < end_row; i += pool_row_stride) {
45  int oj = 0;
46  maxpool_input_cols:
47  for (int j = 0; j < end_col; j += pool_col_stride) {
48  elem_idx = 0;
49  maxpool_tree_outer:
50  // Iterate over the pooling field.
51  for (int k = 0; k < pool_row_size; k++) {
52  maxpool_tree_inner:
53  for (int l = 0; l < pool_col_size; l++) {
54  elems[elem_idx] = _input[img][h][i+k][j+l];
55  elem_idx++;
56  }
57  }
58 
59  float curr_max = 0;
60  if (total_pool_size == 4)
61  curr_max = max4(elems[0], elems[1], elems[2], elems[3]);
62  else if (total_pool_size == 9)
63  curr_max = max9(elems[0], elems[1], elems[2], elems[3],
64  elems[4], elems[5], elems[6], elems[7],
65  elems[8]);
66  else
67  assert(false && "Unsupported pooling size!");
68 
69  _result[img][h][oi][oj] = curr_max;
70  oj++;
71  }
72  oi++;
73  oj = 0;
74  }
75  }
76  }
77 }
78 
83 void ref_max_pooling_nhwc_treemax(float* input,
84  float* result,
85  int img_num,
86  int img_chans,
87  int img_rows,
88  int img_cols,
89  int img_pad,
90  int res_rows,
91  int res_cols,
92  int res_pad,
93  int pool_row_size,
94  int pool_col_size,
95  int pool_row_stride,
96  int pool_col_stride) {
97  int total_pool_size = pool_row_size * pool_col_size;
98  int end_row = img_rows - pool_row_size + 1;
99  int end_col = img_cols - pool_col_size + 1;
100  float elems[total_pool_size];
101  int elem_idx;
102 
103  ARRAY_4D(float, _input, input, img_rows, img_cols, img_chans + img_pad);
104  ARRAY_4D(float, _result, result, res_rows, res_cols, img_chans + res_pad);
105 
106  maxpool_input_num:
107  for (int img = 0; img < img_num; img++) {
108  maxpool_input_height:
109  for (int h = 0; h < img_chans; h++) {
110  int oi = 0;
111  maxpool_input_rows:
112  for (int i = 0; i < end_row; i += pool_row_stride) {
113  int oj = 0;
114  maxpool_input_cols:
115  for (int j = 0; j < end_col; j += pool_col_stride) {
116  elem_idx = 0;
117  maxpool_tree_outer:
118  // Iterate over the pooling field.
119  for (int k = 0; k < pool_row_size; k++) {
120  maxpool_tree_inner:
121  for (int l = 0; l < pool_col_size; l++) {
122  elems[elem_idx] = _input[img][i+k][j+l][h];
123  elem_idx++;
124  }
125  }
126 
127  float curr_max = 0;
128  if (total_pool_size == 4)
129  curr_max = max4(elems[0], elems[1], elems[2], elems[3]);
130  else if (total_pool_size == 9)
131  curr_max = max9(elems[0], elems[1], elems[2], elems[3],
132  elems[4], elems[5], elems[6], elems[7],
133  elems[8]);
134  else
135  assert(false && "Unsupported pooling size!");
136 
137  _result[img][oi][oj][h] = curr_max;
138  oj++;
139  }
140  oi++;
141  oj = 0;
142  }
143  }
144  }
145 }
146 
152  float* result,
153  int img_num,
154  int img_chans,
155  int img_rows,
156  int img_cols,
157  int img_pad,
158  int res_rows,
159  int res_cols,
160  int res_pad,
161  int pool_row_size,
162  int pool_col_size,
163  int pool_row_stride,
164  int pool_col_stride) {
165  int end_row = img_rows - pool_row_size + 1;
166  int end_col = img_cols - pool_col_size + 1;
167  ARRAY_4D(float, _input, input, img_chans, img_rows, img_cols + img_pad);
168  ARRAY_4D(float, _result, result, img_chans, res_rows, res_cols + res_pad);
169 
170  maxpool_input_num:
171  for (int img = 0; img < img_num; img++) {
172  maxpool_input_height:
173  for (int h = 0; h < img_chans; h++) {
174  int oi = 0;
175  maxpool_input_rows:
176  for (int i = 0; i < end_row; i += pool_row_stride) {
177  int oj = 0;
178  maxpool_input_cols:
179  for (int j = 0; j < end_col; j += pool_col_stride) {
180  float curr_max = -FLT_MAX;
181  maxpool_iter_outer:
182  for (int k = 0; k < pool_row_size; k++) {
183  maxpool_iter_inner:
184  for (int l = 0; l < pool_col_size; l++) {
185  float in_val = _input[img][h][i+k][j+l];
186  curr_max = max2(in_val, curr_max);
187  }
188  }
189 
190  _result[img][h][oi][oj] = curr_max;
191  oj++;
192  }
193  oi++;
194  oj = 0;
195  }
196  }
197  }
198 }
199 
205  float* result,
206  int img_num,
207  int img_chans,
208  int img_rows,
209  int img_cols,
210  int img_pad,
211  int res_rows,
212  int res_cols,
213  int res_pad,
214  int pool_row_size,
215  int pool_col_size,
216  int pool_row_stride,
217  int pool_col_stride) {
218  int end_row = img_rows - pool_row_size + 1;
219  int end_col = img_cols - pool_col_size + 1;
220  ARRAY_4D(float, _input, input, img_rows, img_cols, img_chans + img_pad);
221  ARRAY_4D(float, _result, result, res_rows, res_cols, img_chans + res_pad);
222 
223  maxpool_input_num:
224  for (int img = 0; img < img_num; img++) {
225  maxpool_input_height:
226  for (int h = 0; h < img_chans; h++) {
227  int oi = 0;
228  maxpool_input_rows:
229  for (int i = 0; i < end_row; i += pool_row_stride) {
230  int oj = 0;
231  maxpool_input_cols:
232  for (int j = 0; j < end_col; j += pool_col_stride) {
233  float curr_max = -FLT_MAX;
234  maxpool_iter_outer:
235  for (int k = 0; k < pool_row_size; k++) {
236  maxpool_iter_inner:
237  for (int l = 0; l < pool_col_size; l++) {
238  float in_val = _input[img][i+k][j+l][h];
239  curr_max = max2(in_val, curr_max);
240  }
241  }
242 
243  _result[img][oi][oj][h] = curr_max;
244  oj++;
245  }
246  oi++;
247  oj = 0;
248  }
249  }
250  }
251 }
252 
256 void ref_avg_pooling_nchw(float* input,
257  float* result,
258  int img_num,
259  int img_chans,
260  int img_rows,
261  int img_cols,
262  int img_pad,
263  int res_rows,
264  int res_cols,
265  int res_pad,
266  int pool_row_size,
267  int pool_col_size,
268  int pool_row_stride,
269  int pool_col_stride) {
270  int end_row = img_rows - pool_row_size + 1;
271  int end_col = img_cols - pool_col_size + 1;
272  ARRAY_4D(float, _input, input, img_chans, img_rows, img_cols + img_pad);
273  ARRAY_4D(float, _result, result, img_chans, res_rows, res_cols + res_pad);
274  float recip_total_size = 1.0 / (pool_row_size * pool_col_size);
275 
276  maxpool_input_num:
277  for (int img = 0; img < img_num; img++) {
278  maxpool_input_height:
279  for (int h = 0; h < img_chans; h++) {
280  int oi = 0;
281  maxpool_input_rows:
282  for (int i = 0; i < end_row; i += pool_row_stride) {
283  int oj = 0;
284  maxpool_input_cols:
285  for (int j = 0; j < end_col; j += pool_col_stride) {
286  float curr_sum = 0;
287  avgpool_iter_outer:
288  for (int k = 0; k < pool_row_size; k++) {
289  avgpool_iter_inner:
290  for (int l = 0; l < pool_col_size; l++) {
291  curr_sum += _input[img][h][i+k][j+l];
292  }
293  }
294 
295  _result[img][h][oi][oj] = curr_sum * recip_total_size;
296  oj++;
297  }
298  oi++;
299  oj = 0;
300  }
301  }
302  }
303 }
304 
308 void ref_avg_pooling_nhwc(float* input,
309  float* result,
310  int img_num,
311  int img_chans,
312  int img_rows,
313  int img_cols,
314  int img_pad,
315  int res_rows,
316  int res_cols,
317  int res_pad,
318  int pool_row_size,
319  int pool_col_size,
320  int pool_row_stride,
321  int pool_col_stride) {
322  int end_row = img_rows - pool_row_size + 1;
323  int end_col = img_cols - pool_col_size + 1;
324  ARRAY_4D(float, _input, input, img_rows, img_cols, img_chans + img_pad);
325  ARRAY_4D(float, _result, result, res_rows, res_cols, img_chans + res_pad);
326  float recip_total_size = 1.0 / (pool_row_size * pool_col_size);
327 
328  maxpool_input_num:
329  for (int img = 0; img < img_num; img++) {
330  maxpool_input_height:
331  for (int h = 0; h < img_chans; h++) {
332  int oi = 0;
333  maxpool_input_rows:
334  for (int i = 0; i < end_row; i += pool_row_stride) {
335  int oj = 0;
336  maxpool_input_cols:
337  for (int j = 0; j < end_col; j += pool_col_stride) {
338  float curr_sum = 0;
339  avgpool_iter_outer:
340  for (int k = 0; k < pool_row_size; k++) {
341  avgpool_iter_inner:
342  for (int l = 0; l < pool_col_size; l++) {
343  curr_sum += _input[img][i+k][j+l][h];
344  }
345  }
346 
347  _result[img][oi][oj][h] = curr_sum * recip_total_size;
348  oj++;
349  }
350  oi++;
351  oj = 0;
352  }
353  }
354  }
355 }
356 
357 #ifdef __cplusplus
358 }
359 #endif
360 
361 namespace smaug {
362 
363 template <>
364 void MaxPoolingOp<ReferenceBackend>::run() {
365  auto input = getInput(Inputs);
366  auto output = getOutput(Outputs);
367  const TensorShape& inputShape = input->getShape();
368  const TensorShape& outputShape = output->getShape();
369 
370  bool isNCHW = input->getShape().getLayout() == NCHW;
371  bool useTreeMax = (poolingRowSize <= 3 && poolingRowSize == poolingColSize);
372  auto func = isNCHW ? (useTreeMax ? ref_max_pooling_nchw_treemax
374  : (useTreeMax ? ref_max_pooling_nhwc_treemax
376  int poolRowSize, poolColSize, poolRowStride, poolColStride;
377  std::tie(poolRowSize, poolColSize) = getPoolingSize();
378  std::tie(poolRowStride, poolColStride) = getPoolingStride();
379 
380  float* inputData = input->data<float>();
381  float* outputData = output->data<float>();
382  mapArrayToAccel(ref::kPoolingHw, "input", inputData,
383  inputShape.storageSize() * sizeof(float));
384  mapArrayToAccel(ref::kPoolingHw, "result", outputData,
385  outputShape.storageSize() * sizeof(float));
386  int rowIdx = isNCHW ? 2 : 1;
387  int colIdx = isNCHW ? 3 : 2;
388  int chanIdx = isNCHW ? 1 : 3;
389  invokeKernel(ref::kPoolingHw, func, inputData, outputData, inputShape[0],
390  inputShape[chanIdx], inputShape[rowIdx], inputShape[colIdx],
391  inputShape.getPadding(3), outputShape[rowIdx],
392  outputShape[colIdx], outputShape.getPadding(3), poolRowSize,
393  poolColSize, poolRowStride, poolColStride);
394 }
395 
396 template <>
397 void AvgPoolingOp<ReferenceBackend>::run() {
398  auto input = getInput(Inputs);
399  auto output = getOutput(Outputs);
400  const TensorShape& inputShape = input->getShape();
401  const TensorShape& outputShape = output->getShape();
402 
403  bool isNCHW = input->getShape().getLayout() == NCHW;
404  auto func = isNCHW ? ref_avg_pooling_nchw : ref_avg_pooling_nhwc;
405  int poolRowSize, poolColSize, poolRowStride, poolColStride;
406  std::tie(poolRowSize, poolColSize) = getPoolingSize();
407  std::tie(poolRowStride, poolColStride) = getPoolingStride();
408 
409  float* inputData = input->data<float>();
410  float* outputData = output->data<float>();
411  mapArrayToAccel(ref::kPoolingHw, "input", inputData,
412  inputShape.storageSize() * sizeof(float));
413  mapArrayToAccel(ref::kPoolingHw, "result", outputData,
414  outputShape.storageSize() * sizeof(float));
415  int rowIdx = isNCHW ? 2 : 1;
416  int colIdx = isNCHW ? 3 : 2;
417  int chanIdx = isNCHW ? 1 : 3;
418  invokeKernel(ref::kPoolingHw, func, inputData, outputData, inputShape[0],
419  inputShape[chanIdx], inputShape[rowIdx], inputShape[colIdx],
420  inputShape.getPadding(3), outputShape[rowIdx],
421  outputShape[colIdx], outputShape.getPadding(3), poolRowSize,
422  poolColSize, poolRowStride, poolColStride);
423 }
424 
425 } // namespace smaug
426 
ref_max_pooling_nhwc_treemax
void ref_max_pooling_nhwc_treemax(float *input, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int res_rows, int res_cols, int res_pad, int pool_row_size, int pool_col_size, int pool_row_stride, int pool_col_stride)
Definition: ref_pooling_op.cpp:83
ref_avg_pooling_nchw
void ref_avg_pooling_nchw(float *input, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int res_rows, int res_cols, int res_pad, int pool_row_size, int pool_col_size, int pool_row_stride, int pool_col_stride)
Definition: ref_pooling_op.cpp:256
ref_max_pooling_nchw_treemax
void ref_max_pooling_nchw_treemax(float *input, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int res_rows, int res_cols, int res_pad, int pool_row_size, int pool_col_size, int pool_row_stride, int pool_col_stride)
Definition: ref_pooling_op.cpp:15
ref_avg_pooling_nhwc
void ref_avg_pooling_nhwc(float *input, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int res_rows, int res_cols, int res_pad, int pool_row_size, int pool_col_size, int pool_row_stride, int pool_col_stride)
Definition: ref_pooling_op.cpp:308
smaug
The smaug namespace is the parent namespace of all C++ code in SMAUG.
Definition: backend.cpp:38
common.h
Utilities for writing and invoking Aladdin kernels from Operators.
ref_max_pooling_nchw_itermax
void ref_max_pooling_nchw_itermax(float *input, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int res_rows, int res_cols, int res_pad, int pool_row_size, int pool_col_size, int pool_row_stride, int pool_col_stride)
Definition: ref_pooling_op.cpp:151
ref_max_pooling_nhwc_itermax
void ref_max_pooling_nhwc_itermax(float *input, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int res_rows, int res_cols, int res_pad, int pool_row_size, int pool_col_size, int pool_row_stride, int pool_col_stride)
Definition: ref_pooling_op.cpp:204
smaug::mapArrayToAccel
void mapArrayToAccel(unsigned reqCode, const char *arrayName, void *baseAddr, size_t size)
Maps an array of data to the accelerator.
Definition: common.cpp:12
smaug::invokeKernel
void invokeKernel(int accelIdx, unsigned reqCode, const Kernel &kernel, Args &&... args)
The generic blocking interface for all accelerator kernel functions.
Definition: common.h:72