SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
pooling.c
1 #include <stdio.h>
2 #include <float.h>
3 
5 #include "smaug/operators/smv/kernels/params.h"
7 
8 #ifdef __cplusplus
9 extern "C" {
10 #endif
11 
36 void smv_maxpooling_nhwc_vec_fxp(float16* host_inputs,
37  float16* host_results,
38  float* inputs,
39  float* results,
40  int inputs_dims[4],
41  int results_dims[4],
42  int inputs_pad,
43  int results_pad,
44  int pool_rows,
45  int pool_cols,
46  int row_stride,
47  int col_stride,
48  int ofmap_start,
49  SamplingInfo* sampling) {
50  int a_rows = inputs_dims[1];
51  int a_cols = inputs_dims[2];
52  int a_height = inputs_dims[3];
53  int a_pad = inputs_pad;
54  int inputs_size = inputs_dims[0] * a_rows * a_cols * (a_height + a_pad);
55 
56  int results_rows = results_dims[1];
57  int results_cols = results_dims[2];
58  int results_height = results_dims[3];
59  int results_size = results_dims[0] * results_rows * results_cols *
60  (results_height + results_pad);
61 
62  int chan_groups = FRAC_CEIL(a_height, VECTOR_SIZE);
63  int ofmap_start_grp = ofmap_start / VECTOR_SIZE;
64  int end_row = a_rows - pool_rows + 1;
65  int end_col = a_cols - pool_cols + 1;
66 
67  // TODO: Support input batches.
68  VEC_ARRAY_3D(v8fp_t, _a, inputs, a_cols, a_height + a_pad);
69  VEC_ARRAY_3D(v8fp_t,
70  _results,
71  results,
72  results_cols,
73  results_height + results_pad);
74 
75  // Load inputs.
76  host_load_fp16(inputs, host_inputs, inputs_size, 0, 0);
77 
78  // We sample on the pooling kernel only if the highest sampling level is
79  // used.
80  int input_row_sample = end_row;
81  int input_col_sample = end_col;
82  int input_row_total_iters = FRAC_CEIL(end_row, row_stride);
83  int input_col_total_iters = FRAC_CEIL(end_col, col_stride);
84  int input_row_sample_iters = input_row_total_iters;
85  int input_col_sample_iters = input_col_total_iters;
86  int chan_grp_sample = chan_groups;
87  int sample_num = sampling->num_sample_iterations;
88  if (sampling->level >= VeryHigh) {
89  input_row_sample_iters = min2(input_row_sample_iters, sample_num);
90  input_row_sample = input_row_sample_iters * row_stride;
91  input_col_sample_iters = min2(input_col_sample_iters, sample_num);
92  input_col_sample = input_col_sample_iters * col_stride;
93  chan_grp_sample = min2(chan_grp_sample, sample_num);
94  }
95  setSamplingFactor("maxpool_input_row",
96  input_row_total_iters * 1.0 / input_row_sample_iters);
97  setSamplingFactor("maxpool_input_col",
98  input_col_total_iters * 1.0 / input_col_sample_iters);
99  setSamplingFactor("maxpool_chan_grp", chan_groups * 1.0 / chan_grp_sample);
100 
101  int out_row = 0;
102  maxpool_input_row:
103  for (int row = 0; row < input_row_sample; row += row_stride) {
104  int out_col = 0;
105  maxpool_input_col:
106  for (int col = 0; col < input_col_sample; col += col_stride) {
107  maxpool_chan_grp:
108  for (int chan_grp = 0; chan_grp < chan_grp_sample; chan_grp++) {
109  v8fp_t curr_results = {
110  -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX,
111  -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX
112  };
113  maxpool_pool_row:
114  for (int pool_i = 0; pool_i < pool_rows; pool_i++) {
115  maxpool_pool_col:
116  for (int pool_j = 0; pool_j < pool_cols; pool_j++) {
117  v8fp_t next_pixels =
118  _a[row + pool_i][col + pool_j][chan_grp];
119  maxpool_compare:
120  for (int px = 0; px < VECTOR_SIZE; px++) {
121  if (curr_results[px] < next_pixels[px])
122  curr_results[px] = next_pixels[px];
123  }
124  }
125  }
126  // Commit.
127  _results[out_row][out_col][ofmap_start_grp + chan_grp] =
128  curr_results;
129  }
130  out_col++;
131  }
132  out_row++;
133  }
134 
135  // Store results to the host memory if needed.
136  if (ofmap_start + a_height == results_height)
137  host_store_fp16(results, host_results, results_size, 0, 0);
138 }
139 
166 void smv_avgpooling_nhwc_vec_fxp(float16* host_inputs,
167  float16* host_results,
168  float* inputs,
169  float* results,
170  int inputs_dims[4],
171  int results_dims[4],
172  int inputs_pad,
173  int results_pad,
174  int pool_rows,
175  int pool_cols,
176  int row_stride,
177  int col_stride,
178  int ofmap_start,
179  SamplingInfo* sampling) {
180  int a_rows = inputs_dims[1];
181  int a_cols = inputs_dims[2];
182  int a_height = inputs_dims[3];
183  int a_pad = inputs_pad;
184  int inputs_size = inputs_dims[0] * a_rows * a_cols * (a_height + a_pad);
185 
186  int results_rows = results_dims[1];
187  int results_cols = results_dims[2];
188  int results_height = results_dims[3];
189  int results_size = results_dims[0] * results_rows * results_cols *
190  (results_height + results_pad);
191 
192  int chan_groups = FRAC_CEIL(a_height, VECTOR_SIZE);
193  int ofmap_start_grp = ofmap_start / VECTOR_SIZE;
194  int end_row = a_rows - pool_rows + 1;
195  int end_col = a_cols - pool_cols + 1;
196 
197  float scale = 1.0 / (pool_rows * pool_cols);
198  v8fp_t scale_vec = {
199  scale, scale, scale, scale, scale, scale, scale, scale
200  };
201 
202  // TODO: Support input batches.
203  VEC_ARRAY_3D(v8fp_t, _a, inputs, a_cols, a_height + a_pad);
204  VEC_ARRAY_3D(v8fp_t,
205  _results,
206  results,
207  results_cols,
208  results_height + results_pad);
209 
210  // Load inputs.
211  host_load_fp16(inputs, host_inputs, inputs_size, 0, 0);
212 
213  // We sample on the pooling kernel only if the highest sampling level is
214  // used.
215  int input_row_sample = end_row;
216  int input_col_sample = end_col;
217  int input_row_total_iters = FRAC_CEIL(end_row, row_stride);
218  int input_col_total_iters = FRAC_CEIL(end_col, col_stride);
219  int input_row_sample_iters = input_row_total_iters;
220  int input_col_sample_iters = input_col_total_iters;
221  int chan_grp_sample = chan_groups;
222  int sample_num = sampling->num_sample_iterations;
223  if (sampling->level >= VeryHigh) {
224  input_row_sample_iters = min2(input_row_sample_iters, sample_num);
225  input_row_sample = input_row_sample_iters * row_stride;
226  input_col_sample_iters = min2(input_col_sample_iters, sample_num);
227  input_col_sample = input_col_sample_iters * col_stride;
228  chan_grp_sample = min2(chan_grp_sample, sample_num);
229  }
230  setSamplingFactor("avgpool_input_row",
231  input_row_total_iters * 1.0 / input_row_sample_iters);
232  setSamplingFactor("avgpool_input_col",
233  input_col_total_iters * 1.0 / input_col_sample_iters);
234  setSamplingFactor("avgpool_chan_grp", chan_groups * 1.0 / chan_grp_sample);
235 
236  int out_row = 0;
237  avgpool_input_row:
238  for (int row = 0; row < input_row_sample; row += row_stride) {
239  int out_col = 0;
240  avgpool_input_col:
241  for (int col = 0; col < input_col_sample; col += col_stride) {
242  avgpool_chan_grp:
243  for (int chan_grp = 0; chan_grp < chan_grp_sample; chan_grp++) {
244  v8fp_t curr_results = {0, 0, 0, 0, 0, 0, 0, 0};
245  avgpool_pool_row:
246  for (int pool_i = 0; pool_i < pool_rows; pool_i++) {
247  avgpool_pool_col:
248  for (int pool_j = 0; pool_j < pool_cols; pool_j++) {
249  curr_results +=
250  _a[row + pool_i][col + pool_j][chan_grp];
251  }
252  }
253  // Commit.
254  _results[out_row][out_col][ofmap_start_grp + chan_grp] =
255  curr_results * scale_vec;
256  }
257  out_col++;
258  }
259  out_row++;
260  }
261 
262  // Store results to the host memory if needed.
263  if (ofmap_start + a_height == results_height)
264  host_store_fp16(results, host_results, results_size, 0, 0);
265 }
266 
267 #ifdef __cplusplus
268 } // extern "C"
269 #endif
_SamplingInfo
Simulation sampling information maintained by the Operator and passed to the accelerated kernel.
Definition: common.h:262
host_store_fp16
void host_store_fp16(float *local_data, float16 *remote_data, int num_elems, int local_offset, int remote_offset)
Definition: load_store_fp16_data.c:45
host_load_fp16
void host_load_fp16(float *local_data, float16 *remote_data, int num_elems, int local_offset, int remote_offset)
Definition: load_store_fp16_data.c:7
FRAC_CEIL
#define FRAC_CEIL(A, B)
Implements the ceiling function of A/B.
Definition: common.h:505
_SamplingInfo::num_sample_iterations
int num_sample_iterations
The requested number of iterations to run a sampled loop.
Definition: common.h:269
v8fp_t
fp_t v8fp_t
8 packed 32-bit floating point values.
Definition: common.h:301
_SamplingInfo::level
SamplingLevel level
Qualitative level of sampling.
Definition: common.h:264
load_store_fp16_data.h
Aladdin kernels to load/store FP16 data to/from host memory.
smv_maxpooling_nhwc_vec_fxp
void smv_maxpooling_nhwc_vec_fxp(float16 *host_inputs, float16 *host_results, float *inputs, float *results, int inputs_dims[4], int results_dims[4], int inputs_pad, int results_pad, int pool_rows, int pool_cols, int row_stride, int col_stride, int ofmap_start, SamplingInfo *sampling)
Definition: pooling.c:36
common.h
Utilities for writing and invoking Aladdin kernels from Operators.
VECTOR_SIZE
#define VECTOR_SIZE
Vector size used in SMV backends.
Definition: common.h:293
smv_avgpooling_nhwc_vec_fxp
void smv_avgpooling_nhwc_vec_fxp(float16 *host_inputs, float16 *host_results, float *inputs, float *results, int inputs_dims[4], int results_dims[4], int inputs_pad, int results_pad, int pool_rows, int pool_cols, int row_stride, int col_stride, int ofmap_start, SamplingInfo *sampling)
Definition: pooling.c:166