3 #include "smaug/core/backend.h"
5 #include "smaug/operators/pooling_op.h"
28 int pool_col_stride) {
// Max pooling over data indexed as [img][channel][row][col] (see the _input
// and _result accesses below). Each window is copied into elems and reduced
// with max4 or max9, so only windows of 4 or 9 elements are handled.
// NOTE(review): the start of this signature and several statements (the
// declarations of elem_idx, curr_max, oi and oj, and all closing braces) are
// not visible in this excerpt.
29 int total_pool_size = pool_row_size * pool_col_size;
// Exclusive upper bounds for the top-left corner of a window: with
// i < end_row and k < pool_row_size, i + k stays below img_rows (same for
// columns).
30 int end_row = img_rows - pool_row_size + 1;
31 int end_col = img_cols - pool_col_size + 1;
// Scratch buffer holding the elements of one window.
// NOTE(review): the size is a runtime value, which relies on a compiler
// extension in C++ - confirm the build allows it.
32 float elems[total_pool_size];
// ARRAY_4D is a macro defined elsewhere; presumably it declares _input and
// _result as 4-D views over the flat pointers with the listed inner
// dimensions. The padding is added to the innermost (column) dimension.
35 ARRAY_4D(
float, _input, input, img_chans, img_rows, img_cols + img_pad);
36 ARRAY_4D(
float, _result, result, img_chans, res_rows, res_cols + res_pad);
// Loop nest: image, channel, window row origin, window column origin.
39 for (
int img = 0; img < img_num; img++) {
41 for (
int h = 0; h < img_chans; h++) {
// i and j advance by the pooling strides and mark the top-left corner of
// each window.
44 for (
int i = 0; i < end_row; i += pool_row_stride) {
47 for (
int j = 0; j < end_col; j += pool_col_stride) {
// Gather the window elements into elems.
// NOTE(review): elem_idx is presumably advanced in a line not shown here.
51 for (
int k = 0; k < pool_row_size; k++) {
53 for (
int l = 0; l < pool_col_size; l++) {
54 elems[elem_idx] = _input[img][h][i+k][j+l];
// Reduce the gathered window. Only 4-element and 9-element windows have a
// reduction here; the assert that follows is presumably the final else
// branch for every other size (the else line is not visible).
60 if (total_pool_size == 4)
61 curr_max = max4(elems[0], elems[1], elems[2], elems[3]);
62 else if (total_pool_size == 9)
63 curr_max = max9(elems[0], elems[1], elems[2], elems[3],
64 elems[4], elems[5], elems[6], elems[7],
67 assert(
false &&
"Unsupported pooling size!");
// oi and oj are the output coordinates; their computation is not visible.
69 _result[img][h][oi][oj] = curr_max;
96 int pool_col_stride) {
// Max pooling over data indexed as [img][row][col][channel] (see the _input
// and _result accesses below). Each window is copied into elems and reduced
// with max4 or max9, so only windows of 4 or 9 elements are handled.
// NOTE(review): the start of this signature and several statements (the
// declarations of elem_idx, curr_max, oi and oj, and all closing braces) are
// not visible in this excerpt.
97 int total_pool_size = pool_row_size * pool_col_size;
// Exclusive upper bounds for the top-left corner of a window, so that
// i + k < img_rows and j + l < img_cols inside the gather loops.
98 int end_row = img_rows - pool_row_size + 1;
99 int end_col = img_cols - pool_col_size + 1;
// Scratch buffer holding the elements of one window.
// NOTE(review): the size is a runtime value, which relies on a compiler
// extension in C++ - confirm the build allows it.
100 float elems[total_pool_size];
// ARRAY_4D is a macro defined elsewhere; presumably it declares _input and
// _result as 4-D views over the flat pointers with the listed inner
// dimensions. The padding is added to the innermost (channel) dimension.
103 ARRAY_4D(
float, _input, input, img_rows, img_cols, img_chans + img_pad);
104 ARRAY_4D(
float, _result, result, res_rows, res_cols, img_chans + res_pad);
// Loop nest: image, channel, window row origin, window column origin.
107 for (
int img = 0; img < img_num; img++) {
// Label on the channel loop; its consumer is not visible in this file
// excerpt.
108 maxpool_input_height:
109 for (
int h = 0; h < img_chans; h++) {
// i and j advance by the pooling strides and mark the top-left corner of
// each window.
112 for (
int i = 0; i < end_row; i += pool_row_stride) {
115 for (
int j = 0; j < end_col; j += pool_col_stride) {
// Gather the window elements for channel h into elems.
// NOTE(review): elem_idx is presumably advanced in a line not shown here.
119 for (
int k = 0; k < pool_row_size; k++) {
121 for (
int l = 0; l < pool_col_size; l++) {
122 elems[elem_idx] = _input[img][i+k][j+l][h];
// Reduce the gathered window. Only 4-element and 9-element windows have a
// reduction here; the assert that follows is presumably the final else
// branch for every other size (the else line is not visible).
128 if (total_pool_size == 4)
129 curr_max = max4(elems[0], elems[1], elems[2], elems[3]);
130 else if (total_pool_size == 9)
131 curr_max = max9(elems[0], elems[1], elems[2], elems[3],
132 elems[4], elems[5], elems[6], elems[7],
135 assert(
false &&
"Unsupported pooling size!");
// oi and oj are the output coordinates; their computation is not visible.
137 _result[img][oi][oj][h] = curr_max;
164 int pool_col_stride) {
// Max pooling over data indexed as [img][channel][row][col]. The maximum of
// each window is accumulated element by element with max2, so no fixed
// window size is required by the visible code.
// NOTE(review): the start of this signature, the computation of oi and oj,
// and all closing braces are not visible in this excerpt.
// Exclusive upper bounds for the top-left corner of a window, so that
// i + k < img_rows and j + l < img_cols inside the inner loops.
165 int end_row = img_rows - pool_row_size + 1;
166 int end_col = img_cols - pool_col_size + 1;
// ARRAY_4D is a macro defined elsewhere; presumably it declares _input and
// _result as 4-D views over the flat pointers with the listed inner
// dimensions. The padding is added to the innermost (column) dimension.
167 ARRAY_4D(
float, _input, input, img_chans, img_rows, img_cols + img_pad);
168 ARRAY_4D(
float, _result, result, img_chans, res_rows, res_cols + res_pad);
// Loop nest: image, channel, window row origin, window column origin.
171 for (
int img = 0; img < img_num; img++) {
// Label on the channel loop; its consumer is not visible in this file
// excerpt.
172 maxpool_input_height:
173 for (
int h = 0; h < img_chans; h++) {
// i and j advance by the pooling strides and mark the top-left corner of
// each window.
176 for (
int i = 0; i < end_row; i += pool_row_stride) {
179 for (
int j = 0; j < end_col; j += pool_col_stride) {
// Start from the most negative finite float so any finite input replaces it.
180 float curr_max = -FLT_MAX;
// Scan the window; max2 is defined elsewhere and presumably returns the
// larger of its two arguments.
182 for (
int k = 0; k < pool_row_size; k++) {
184 for (
int l = 0; l < pool_col_size; l++) {
185 float in_val = _input[img][h][i+k][j+l];
186 curr_max = max2(in_val, curr_max);
// oi and oj are the output coordinates; their computation is not visible.
190 _result[img][h][oi][oj] = curr_max;
217 int pool_col_stride) {
// Max pooling over data indexed as [img][row][col][channel]. The maximum of
// each window is accumulated element by element with max2, so no fixed
// window size is required by the visible code.
// NOTE(review): the start of this signature, the computation of oi and oj,
// and all closing braces are not visible in this excerpt.
// Exclusive upper bounds for the top-left corner of a window, so that
// i + k < img_rows and j + l < img_cols inside the inner loops.
218 int end_row = img_rows - pool_row_size + 1;
219 int end_col = img_cols - pool_col_size + 1;
// ARRAY_4D is a macro defined elsewhere; presumably it declares _input and
// _result as 4-D views over the flat pointers with the listed inner
// dimensions. The padding is added to the innermost (channel) dimension.
220 ARRAY_4D(
float, _input, input, img_rows, img_cols, img_chans + img_pad);
221 ARRAY_4D(
float, _result, result, res_rows, res_cols, img_chans + res_pad);
// Loop nest: image, channel, window row origin, window column origin.
224 for (
int img = 0; img < img_num; img++) {
// Label on the channel loop; its consumer is not visible in this file
// excerpt.
225 maxpool_input_height:
226 for (
int h = 0; h < img_chans; h++) {
// i and j advance by the pooling strides and mark the top-left corner of
// each window.
229 for (
int i = 0; i < end_row; i += pool_row_stride) {
232 for (
int j = 0; j < end_col; j += pool_col_stride) {
// Start from the most negative finite float so any finite input replaces it.
233 float curr_max = -FLT_MAX;
// Scan the window for channel h; max2 is defined elsewhere and presumably
// returns the larger of its two arguments.
235 for (
int k = 0; k < pool_row_size; k++) {
237 for (
int l = 0; l < pool_col_size; l++) {
238 float in_val = _input[img][i+k][j+l][h];
239 curr_max = max2(in_val, curr_max);
// oi and oj are the output coordinates; their computation is not visible.
243 _result[img][oi][oj][h] = curr_max;
269 int pool_col_stride) {
// Average pooling over data indexed as [img][channel][row][col]: the
// elements of each window are summed and the sum is scaled by the reciprocal
// of the window element count.
// NOTE(review): the start of this signature, the declaration of curr_sum,
// the computation of oi and oj, and all closing braces are not visible in
// this excerpt.
// Exclusive upper bounds for the top-left corner of a window, so that
// i + k < img_rows and j + l < img_cols inside the inner loops.
270 int end_row = img_rows - pool_row_size + 1;
271 int end_col = img_cols - pool_col_size + 1;
// ARRAY_4D is a macro defined elsewhere; presumably it declares _input and
// _result as 4-D views over the flat pointers with the listed inner
// dimensions. The padding is added to the innermost (column) dimension.
272 ARRAY_4D(
float, _input, input, img_chans, img_rows, img_cols + img_pad);
273 ARRAY_4D(
float, _result, result, img_chans, res_rows, res_cols + res_pad);
// Computed once so each output needs a multiply instead of a divide. The
// literal 1.0 is a double, so the division is done in double precision and
// then narrowed to float.
274 float recip_total_size = 1.0 / (pool_row_size * pool_col_size);
// Loop nest: image, channel, window row origin, window column origin.
277 for (
int img = 0; img < img_num; img++) {
// NOTE(review): the label says maxpool although this fragment averages;
// confirm whether anything depends on the label name before renaming it.
278 maxpool_input_height:
279 for (
int h = 0; h < img_chans; h++) {
// i and j advance by the pooling strides and mark the top-left corner of
// each window.
282 for (
int i = 0; i < end_row; i += pool_row_stride) {
285 for (
int j = 0; j < end_col; j += pool_col_stride) {
// Sum every element of the window; curr_sum is presumably reset per window
// in a line not shown here.
288 for (
int k = 0; k < pool_row_size; k++) {
290 for (
int l = 0; l < pool_col_size; l++) {
291 curr_sum += _input[img][h][i+k][j+l];
// oi and oj are the output coordinates; their computation is not visible.
295 _result[img][h][oi][oj] = curr_sum * recip_total_size;
321 int pool_col_stride) {
// Average pooling over data indexed as [img][row][col][channel]: the
// elements of each window are summed and the sum is scaled by the reciprocal
// of the window element count.
// NOTE(review): the start of this signature, the declaration of curr_sum,
// the computation of oi and oj, and all closing braces are not visible in
// this excerpt.
// Exclusive upper bounds for the top-left corner of a window, so that
// i + k < img_rows and j + l < img_cols inside the inner loops.
322 int end_row = img_rows - pool_row_size + 1;
323 int end_col = img_cols - pool_col_size + 1;
// ARRAY_4D is a macro defined elsewhere; presumably it declares _input and
// _result as 4-D views over the flat pointers with the listed inner
// dimensions. The padding is added to the innermost (channel) dimension.
324 ARRAY_4D(
float, _input, input, img_rows, img_cols, img_chans + img_pad);
325 ARRAY_4D(
float, _result, result, res_rows, res_cols, img_chans + res_pad);
// Computed once so each output needs a multiply instead of a divide. The
// literal 1.0 is a double, so the division is done in double precision and
// then narrowed to float.
326 float recip_total_size = 1.0 / (pool_row_size * pool_col_size);
// Loop nest: image, channel, window row origin, window column origin.
329 for (
int img = 0; img < img_num; img++) {
// NOTE(review): the label says maxpool although this fragment averages;
// confirm whether anything depends on the label name before renaming it.
330 maxpool_input_height:
331 for (
int h = 0; h < img_chans; h++) {
// i and j advance by the pooling strides and mark the top-left corner of
// each window.
334 for (
int i = 0; i < end_row; i += pool_row_stride) {
337 for (
int j = 0; j < end_col; j += pool_col_stride) {
// Sum every element of the window for channel h; curr_sum is presumably
// reset per window in a line not shown here.
340 for (
int k = 0; k < pool_row_size; k++) {
342 for (
int l = 0; l < pool_col_size; l++) {
343 curr_sum += _input[img][i+k][j+l][h];
// oi and oj are the output coordinates; their computation is not visible.
347 _result[img][oi][oj][h] = curr_sum * recip_total_size;
// Runs max pooling on the reference backend: reads the input and output
// tensors, derives layout-dependent dimension indices, and dispatches a
// kernel through invokeKernel.
// NOTE(review): several original lines are missing from this excerpt (the
// declaration and selection of func, the heads of two calls, and the closing
// brace), so the comments below cover only what is visible.
364 void MaxPoolingOp<ReferenceBackend>::run() {
365 auto input = getInput(Inputs);
366 auto output = getOutput(Outputs);
367 const TensorShape& inputShape = input->getShape();
368 const TensorShape& outputShape = output->getShape();
// Layout flag; every layout other than NCHW takes the second branch of the
// index selection further down.
370 bool isNCHW = input->getShape().getLayout() == NCHW;
// True for square windows whose side is at most 3.
// NOTE(review): this also admits a 1x1 window. If the flag selects the
// kernels that only reduce 4 or 9 elements, a 1x1 window would reach their
// unsupported-size assert - confirm against the selection code that is not
// shown here.
371 bool useTreeMax = (poolingRowSize <= 3 && poolingRowSize == poolingColSize);
// Unpack the pooling window size and stride pairs.
376 int poolRowSize, poolColSize, poolRowStride, poolColStride;
377 std::tie(poolRowSize, poolColSize) = getPoolingSize();
378 std::tie(poolRowStride, poolColStride) = getPoolingStride();
380 float* inputData = input->data<
float>();
381 float* outputData = output->data<
float>();
// Tails of two calls whose beginnings are not visible. Each passes the
// storage size of a tensor multiplied by sizeof(float), presumably the
// buffer size in bytes.
383 inputShape.storageSize() *
sizeof(
float));
385 outputShape.storageSize() *
sizeof(
float));
// Positions of the row, column and channel dimensions within the shape:
// (2, 3, 1) for NCHW, otherwise (1, 2, 3), i.e. channels last.
386 int rowIdx = isNCHW ? 2 : 1;
387 int colIdx = isNCHW ? 3 : 2;
388 int chanIdx = isNCHW ? 1 : 3;
// Dispatch the kernel. func is chosen in lines not shown here. Padding is
// always read for dimension 3, the innermost dimension in both layouts.
389 invokeKernel(ref::kPoolingHw, func, inputData, outputData, inputShape[0],
390 inputShape[chanIdx], inputShape[rowIdx], inputShape[colIdx],
391 inputShape.getPadding(3), outputShape[rowIdx],
392 outputShape[colIdx], outputShape.getPadding(3), poolRowSize,
393 poolColSize, poolRowStride, poolColStride);
// Runs average pooling on the reference backend: reads the input and output
// tensors, derives layout-dependent dimension indices, and dispatches a
// kernel through invokeKernel.
// NOTE(review): several original lines are missing from this excerpt (the
// declaration and selection of func, the heads of two calls, and the closing
// brace), so the comments below cover only what is visible.
397 void AvgPoolingOp<ReferenceBackend>::run() {
398 auto input = getInput(Inputs);
399 auto output = getOutput(Outputs);
400 const TensorShape& inputShape = input->getShape();
401 const TensorShape& outputShape = output->getShape();
// Layout flag; every layout other than NCHW takes the second branch of the
// index selection further down.
403 bool isNCHW = input->getShape().getLayout() == NCHW;
// Unpack the pooling window size and stride pairs.
405 int poolRowSize, poolColSize, poolRowStride, poolColStride;
406 std::tie(poolRowSize, poolColSize) = getPoolingSize();
407 std::tie(poolRowStride, poolColStride) = getPoolingStride();
409 float* inputData = input->data<
float>();
410 float* outputData = output->data<
float>();
// Tails of two calls whose beginnings are not visible. Each passes the
// storage size of a tensor multiplied by sizeof(float), presumably the
// buffer size in bytes.
412 inputShape.storageSize() *
sizeof(
float));
414 outputShape.storageSize() *
sizeof(
float));
// Positions of the row, column and channel dimensions within the shape:
// (2, 3, 1) for NCHW, otherwise (1, 2, 3), i.e. channels last.
415 int rowIdx = isNCHW ? 2 : 1;
416 int colIdx = isNCHW ? 3 : 2;
417 int chanIdx = isNCHW ? 1 : 3;
// Dispatch the kernel. func is chosen in lines not shown here, presumably
// from isNCHW. Padding is always read for dimension 3, the innermost
// dimension in both layouts.
418 invokeKernel(ref::kPoolingHw, func, inputData, outputData, inputShape[0],
419 inputShape[chanIdx], inputShape[rowIdx], inputShape[colIdx],
420 inputShape.getPadding(3), outputShape[rowIdx],
421 outputShape[colIdx], outputShape.getPadding(3), poolRowSize,
422 poolColSize, poolRowStride, poolColStride);