5 #include "smaug/operators/smv/kernels/params.h"
// NOTE(review): this view is a sparse excerpt: the function signature and
// many body lines are not visible here. The "maxpool_*" sampling labels
// below suggest this is the max-pooling kernel; confirm against the full file.
37 float16* host_results,
// Unpack the input dimensions. Index 0 only contributes to the total size.
// NOTE(review): presumably an NHWC layout (index 3 = channels); confirm.
50 int a_rows = inputs_dims[1];
51 int a_cols = inputs_dims[2];
52 int a_height = inputs_dims[3];
53 int a_pad = inputs_pad;
// Total element count of the input, with the innermost dimension padded.
54 int inputs_size = inputs_dims[0] * a_rows * a_cols * (a_height + a_pad);
// Same unpacking and padded size computation for the results.
56 int results_rows = results_dims[1];
57 int results_cols = results_dims[2];
58 int results_height = results_dims[3];
59 int results_size = results_dims[0] * results_rows * results_cols *
60 (results_height + results_pad);
// Exclusive upper bounds for the window's top-left corner: for any
// row < end_row and pool_i < pool_rows, row + pool_i stays below a_rows
// (likewise for columns).
64 int end_row = a_rows - pool_rows + 1;
65 int end_col = a_cols - pool_cols + 1;
// NOTE(review): VEC_ARRAY_3D is defined elsewhere; presumably it declares _a
// as a 3-D view of v8fp_t vectors over `inputs` with the given inner
// dimensions. Verify against the macro definition.
68 VEC_ARRAY_3D(
v8fp_t, _a, inputs, a_cols, a_height + a_pad);
// Tail of a statement whose beginning is not visible in this excerpt.
73 results_height + results_pad);
// Loop bounds and iteration counts. Without sampling, the loops cover the
// full range [0, end_row) x [0, end_col) and all channel groups.
80 int input_row_sample = end_row;
81 int input_col_sample = end_col;
// NOTE(review): FRAC_CEIL is defined elsewhere; presumably ceiling division,
// i.e. the number of strided iterations. Confirm.
82 int input_row_total_iters =
FRAC_CEIL(end_row, row_stride);
83 int input_col_total_iters =
FRAC_CEIL(end_col, col_stride);
84 int input_row_sample_iters = input_row_total_iters;
85 int input_col_sample_iters = input_col_total_iters;
86 int chan_grp_sample = chan_groups;
// At sampling level VeryHigh or above, cap each loop's iteration count at
// sample_num and shrink the loop bounds accordingly (iterations * stride).
88 if (sampling->
level >= VeryHigh) {
89 input_row_sample_iters = min2(input_row_sample_iters, sample_num);
90 input_row_sample = input_row_sample_iters * row_stride;
91 input_col_sample_iters = min2(input_col_sample_iters, sample_num);
92 input_col_sample = input_col_sample_iters * col_stride;
93 chan_grp_sample = min2(chan_grp_sample, sample_num);
// Record, per loop, the ratio of total to sampled iterations. The `* 1.0`
// promotes the operands so the division is done in floating point.
95 setSamplingFactor(
"maxpool_input_row",
96 input_row_total_iters * 1.0 / input_row_sample_iters);
97 setSamplingFactor(
"maxpool_input_col",
98 input_col_total_iters * 1.0 / input_col_sample_iters);
99 setSamplingFactor(
"maxpool_chan_grp", chan_groups * 1.0 / chan_grp_sample);
// Slide the pooling window over the (possibly sampled) input rows/columns,
// stepping by the strides, and over each channel group.
103 for (
int row = 0; row < input_row_sample; row += row_stride) {
106 for (
int col = 0; col < input_col_sample; col += col_stride) {
108 for (
int chan_grp = 0; chan_grp < chan_grp_sample; chan_grp++) {
// Eight lanes initialised to -FLT_MAX so any input value compares larger.
// NOTE(review): presumably the initializer of curr_results; the declaration
// line is not visible.
110 -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX,
111 -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX
// Visit every element of the pool_rows x pool_cols window.
114 for (
int pool_i = 0; pool_i < pool_rows; pool_i++) {
116 for (
int pool_j = 0; pool_j < pool_cols; pool_j++) {
// Input vector at the current window position and channel group (the
// left-hand side of this statement is not visible).
118 _a[row + pool_i][col + pool_j][chan_grp];
// Lane-wise running maximum: keep the larger value in each lane px.
121 if (curr_results[px] < next_pixels[px])
122 curr_results[px] = next_pixels[px];
// Store into the output at (out_row, out_col), offset by ofmap_start_grp in
// the innermost index (the right-hand side is not visible).
127 _results[out_row][out_col][ofmap_start_grp + chan_grp] =
// True when ofmap_start plus this input's a_height reaches exactly
// results_height. NOTE(review): the guarded statement is not visible.
136 if (ofmap_start + a_height == results_height)
// NOTE(review): this view is a sparse excerpt: the function signature and
// many body lines are not visible here. The "avgpool_*" sampling labels
// below suggest this is the average-pooling kernel; confirm against the full
// file.
167 float16* host_results,
// Unpack the input dimensions. Index 0 only contributes to the total size.
// NOTE(review): presumably an NHWC layout (index 3 = channels); confirm.
180 int a_rows = inputs_dims[1];
181 int a_cols = inputs_dims[2];
182 int a_height = inputs_dims[3];
183 int a_pad = inputs_pad;
// Total element count of the input, with the innermost dimension padded.
184 int inputs_size = inputs_dims[0] * a_rows * a_cols * (a_height + a_pad);
// Same unpacking and padded size computation for the results.
186 int results_rows = results_dims[1];
187 int results_cols = results_dims[2];
188 int results_height = results_dims[3];
189 int results_size = results_dims[0] * results_rows * results_cols *
190 (results_height + results_pad);
// Exclusive upper bounds for the window's top-left corner: for any
// row < end_row and pool_i < pool_rows, row + pool_i stays below a_rows
// (likewise for columns).
194 int end_row = a_rows - pool_rows + 1;
195 int end_col = a_cols - pool_cols + 1;
// Reciprocal of the number of elements in one pooling window. The product is
// computed in int, then the division is done in double and narrowed to float.
// NOTE(review): no guard against pool_rows * pool_cols == 0 is visible here.
197 float scale = 1.0 / (pool_rows * pool_cols);
// Eight copies of scale. NOTE(review): presumably the initializer of
// scale_vec (used in the store below); the declaration line is not visible.
199 scale, scale, scale, scale, scale, scale, scale, scale
// NOTE(review): VEC_ARRAY_3D is defined elsewhere; presumably it declares _a
// as a 3-D view of v8fp_t vectors over `inputs` with the given inner
// dimensions. Verify against the macro definition.
203 VEC_ARRAY_3D(
v8fp_t, _a, inputs, a_cols, a_height + a_pad);
// Tail of a statement whose beginning is not visible in this excerpt.
208 results_height + results_pad);
// Loop bounds and iteration counts. Without sampling, the loops cover the
// full range [0, end_row) x [0, end_col) and all channel groups.
215 int input_row_sample = end_row;
216 int input_col_sample = end_col;
// NOTE(review): FRAC_CEIL is defined elsewhere; presumably ceiling division,
// i.e. the number of strided iterations. Confirm.
217 int input_row_total_iters =
FRAC_CEIL(end_row, row_stride);
218 int input_col_total_iters =
FRAC_CEIL(end_col, col_stride);
219 int input_row_sample_iters = input_row_total_iters;
220 int input_col_sample_iters = input_col_total_iters;
221 int chan_grp_sample = chan_groups;
// At sampling level VeryHigh or above, cap each loop's iteration count at
// sample_num and shrink the loop bounds accordingly (iterations * stride).
223 if (sampling->
level >= VeryHigh) {
224 input_row_sample_iters = min2(input_row_sample_iters, sample_num);
225 input_row_sample = input_row_sample_iters * row_stride;
226 input_col_sample_iters = min2(input_col_sample_iters, sample_num);
227 input_col_sample = input_col_sample_iters * col_stride;
228 chan_grp_sample = min2(chan_grp_sample, sample_num);
// Record, per loop, the ratio of total to sampled iterations. The `* 1.0`
// promotes the operands so the division is done in floating point.
230 setSamplingFactor(
"avgpool_input_row",
231 input_row_total_iters * 1.0 / input_row_sample_iters);
232 setSamplingFactor(
"avgpool_input_col",
233 input_col_total_iters * 1.0 / input_col_sample_iters);
234 setSamplingFactor(
"avgpool_chan_grp", chan_groups * 1.0 / chan_grp_sample);
// Slide the pooling window over the (possibly sampled) input rows/columns,
// stepping by the strides, and over each channel group.
238 for (
int row = 0; row < input_row_sample; row += row_stride) {
241 for (
int col = 0; col < input_col_sample; col += col_stride) {
243 for (
int chan_grp = 0; chan_grp < chan_grp_sample; chan_grp++) {
// Eight-lane vector starting at zero for each output position.
244 v8fp_t curr_results = {0, 0, 0, 0, 0, 0, 0, 0};
// Visit every element of the pool_rows x pool_cols window.
246 for (
int pool_i = 0; pool_i < pool_rows; pool_i++) {
248 for (
int pool_j = 0; pool_j < pool_cols; pool_j++) {
// Input vector at the current window position and channel group (the
// left-hand side and the accumulation statement are not visible).
250 _a[row + pool_i][col + pool_j][chan_grp];
// Store curr_results scaled lane-wise by scale_vec at (out_row, out_col),
// offset by ofmap_start_grp in the innermost index.
// NOTE(review): this is the window average only if curr_results holds the
// window sum; the accumulation line is not visible, so confirm.
254 _results[out_row][out_col][ofmap_start_grp + chan_grp] =
255 curr_results * scale_vec;
// True when ofmap_start plus this input's a_height reaches exactly
// results_height. NOTE(review): the guarded statement is not visible.
263 if (ofmap_start + a_height == results_height)