SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
batch_norm.c
#include <assert.h>
#include <stdio.h>

#include "smaug/operators/common.h"
#include "smaug/operators/smv/kernels/params.h"
#include "smaug/operators/smv/kernels/load_store_fp16_data.h"
#include "smaug/operators/smv/kernels/activation_functions_simd.h"

#ifdef __cplusplus
extern "C" {
#endif

// Performs an 8-wide vectorized batch normalization:
//   y = gamma * (x - mean) * (1 / sqrt(variance)) + beta
// The variance is passed as its precomputed reciprocal square root, so the
// normalization requires no division.
ALWAYS_INLINE
v8fp_t batch_norm_simd_op(v8fp_t input,
                          v8fp_t mean,
                          v8fp_t recip_sqrt_var,
                          v8fp_t gamma,
                          v8fp_t beta) {
    v8fp_t scale = recip_sqrt_var * gamma;
    v8fp_t shift = input - mean;
    return shift * scale + beta;
}
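
The SIMD op folds gamma and the reciprocal square root of the variance into a single per-lane scale, so each output needs only a subtract, a multiply, and a multiply-add. Below is a minimal scalar sketch of the same computation, usable as a host-side reference when checking one lane of kernel output; it assumes the host folds a small epsilon into the variance before taking the reciprocal square root. batch_norm_scalar_ref and eps are illustrative names, not part of SMAUG.

#include <math.h>

// Scalar reference for one lane: y = gamma * (x - mean) / sqrt(var + eps) + beta.
// Assumes epsilon is folded into the variance before the reciprocal square
// root, matching the recip_sqrt_var weight row used by the kernels.
static float batch_norm_scalar_ref(float x, float mean, float var,
                                   float gamma, float beta, float eps) {
    float recip_sqrt_var = 1.0f / sqrtf(var + eps);
    float scale = recip_sqrt_var * gamma;  // same folding as batch_norm_simd_op
    return (x - mean) * scale + beta;
}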

/* Batch normalization following a fully-connected layer, with inputs in NC
 * format. Each output activation has its own set of parameters, stored as
 * four weight rows: mean, 1/sqrt(variance), gamma, and beta.
 */
void smv_batch_norm_post_fc_nc_vec_fxp(float16* host_inputs,
                                       float16* host_weights,
                                       float16* host_results,
                                       float* inputs,
                                       float* weights,
                                       float* results,
                                       int inputs_dims[2],
                                       int weights_acts,
                                       int inputs_pad,
                                       int inputs_start,
                                       int send_results,
                                       activation_type act_function,
                                       activation_param_t act_params) {
    int inputs_nums = inputs_dims[0];
    int inputs_acts = inputs_dims[1];
    int inputs_size = inputs_nums * (inputs_acts + inputs_pad);
    int weights_size = 4 * (weights_acts + inputs_pad);
    int results_size = inputs_size;
    int inputs_start_vec = inputs_start / VECTOR_SIZE;

    // Load inputs and weights if needed.
    if (inputs_start == 0)
        host_load_fp16(inputs, host_inputs, inputs_size, 0, 0);
    host_load_fp16(weights, host_weights, weights_size, 0, 0);

    VEC_ARRAY_2D(v8fp_t, _inputs, inputs, inputs_acts + inputs_pad);
    VEC_ARRAY_2D(v8fp_t, _weights, weights, weights_acts + inputs_pad);
    VEC_ARRAY_2D(v8fp_t, _results, results, inputs_acts + inputs_pad);

    bn_batch:
    for (int i = 0; i < inputs_nums; i++) {
        bn_input:
        for (int j = 0; j < weights_acts / VECTOR_SIZE; j++) {
            _results[i][j + inputs_start_vec] =
                    batch_norm_simd_op(_inputs[i][j + inputs_start_vec],
                                       _weights[0][j],
                                       _weights[1][j],
                                       _weights[2][j],
                                       _weights[3][j]);
        }
    }
    // Only run activation functions when the results are finished.
    if (act_function != NO_ACTIVATION && send_results) {
        activation_fun_vec(
                results, results, results_size, act_function, act_params);
    }
    // Store results to the host memory if needed.
    if (send_results)
        host_store_fp16(results, host_results, results_size, 0, 0);
}
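
All three kernels index the weights as four rows of length (num_params + pad): mean, reciprocal square root of the variance, gamma, and beta. The following hedged sketch shows how a host might lay out those rows before converting them to float16; pack_bn_weights_ref and eps are hypothetical names, and the real layout is produced by host-side operator code before the fp32-to-fp16 conversion.

#include <math.h>
#include <string.h>

// Hypothetical host-side packing for the FC batch-norm kernel's weights:
// row 0 = mean, row 1 = 1/sqrt(var + eps), row 2 = gamma, row 3 = beta,
// each row padded out to (weights_acts + inputs_pad) floats.
static void pack_bn_weights_ref(float* weights, const float* mean,
                                const float* var, const float* gamma,
                                const float* beta, int weights_acts,
                                int inputs_pad, float eps) {
    int row_len = weights_acts + inputs_pad;
    memset(weights, 0, 4 * row_len * sizeof(float));  // zero the padding
    for (int j = 0; j < weights_acts; j++) {
        weights[0 * row_len + j] = mean[j];
        weights[1 * row_len + j] = 1.0f / sqrtf(var[j] + eps);
        weights[2 * row_len + j] = gamma[j];
        weights[3 * row_len + j] = beta[j];
    }
}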

/* Batch normalization following a convolutional or pooling layer, with
 * inputs in NCHW format. All activations in one output feature map share a
 * single set of (mean, 1/sqrt(variance), gamma, beta), so each per-channel
 * parameter is broadcast across the eight vector lanes before the SIMD op.
 */
void smv_batch_norm_post_conv_nchw_vec_fxp(float16* host_inputs,
                                           float16* host_weights,
                                           float16* host_results,
                                           float* inputs,
                                           float* weights,
                                           float* results,
                                           int inputs_dims[4],
                                           int weights_chans,
                                           int inputs_pad,
                                           int weights_pad,
                                           int weights_start,
                                           activation_type act_function,
                                           activation_param_t act_params) {
    int inputs_nums = inputs_dims[0];
    int inputs_chans = inputs_dims[1];
    int inputs_rows = inputs_dims[2];
    int inputs_cols = inputs_dims[3];
    int inputs_size = inputs_nums * inputs_chans * inputs_rows *
                      (inputs_cols + inputs_pad);
    int weights_size = 4 * (weights_chans + weights_pad);
    int results_size = inputs_size;
    int weights_start_vec = weights_start / VECTOR_SIZE;

    // Load inputs and weights if needed.
    host_load_fp16(inputs, host_inputs, inputs_size, 0, 0);
    if (weights_start == 0)
        host_load_fp16(weights, host_weights, weights_size, 0, 0);

    VEC_ARRAY_4D(v8fp_t,
                 _inputs,
                 inputs,
                 inputs_chans,
                 inputs_rows,
                 inputs_cols + inputs_pad);
    VEC_ARRAY_2D(v8fp_t, _weights, weights, weights_chans + weights_pad);
    VEC_ARRAY_4D(v8fp_t,
                 _results,
                 results,
                 inputs_chans,
                 inputs_rows,
                 inputs_cols + inputs_pad);

    bn_batch:
    for (int i = 0; i < inputs_nums; i++) {
        bn_chan:
        for (int h = 0; h < FRAC_CEIL(inputs_chans, VECTOR_SIZE); h++) {
            bn_chan_vec:
            for (int v = 0; v < VECTOR_SIZE; v++) {
                float mean = _weights[0][h + weights_start_vec][v];
                float recip_sqrt_var = _weights[1][h + weights_start_vec][v];
                float gamma = _weights[2][h + weights_start_vec][v];
                float beta = _weights[3][h + weights_start_vec][v];
                v8fp_t mean_vec = { mean, mean, mean, mean,
                                    mean, mean, mean, mean };
                v8fp_t recip_sqrt_var_vec = { recip_sqrt_var, recip_sqrt_var,
                                              recip_sqrt_var, recip_sqrt_var,
                                              recip_sqrt_var, recip_sqrt_var,
                                              recip_sqrt_var, recip_sqrt_var };
                v8fp_t gamma_vec = { gamma, gamma, gamma, gamma,
                                     gamma, gamma, gamma, gamma };
                v8fp_t beta_vec = { beta, beta, beta, beta,
                                    beta, beta, beta, beta };

                int ofmap = h * VECTOR_SIZE + v;
                bn_row:
                for (int r = 0; r < inputs_rows; r++) {
                    bn_col:
                    for (int c = 0; c < FRAC_CEIL(inputs_cols, VECTOR_SIZE);
                         c++) {
                        _results[i][ofmap][r][c] =
                                batch_norm_simd_op(_inputs[i][ofmap][r][c],
                                                   mean_vec,
                                                   recip_sqrt_var_vec,
                                                   gamma_vec,
                                                   beta_vec);
                    }
                }
            }
        }
    }
    if (act_function != NO_ACTIVATION) {
        activation_fun_vec(
                results, results, results_size, act_function, act_params);
    }
    // Store results to the host memory.
    host_store_fp16(results, host_results, results_size, 0, 0);
}
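
In NCHW, one output feature map is contiguous, so the per-channel parameters above are read out lane by lane and splatted across all eight lanes before the row and column loops run. The same broadcast idiom is shown below in stand-alone GCC/Clang vector-extension C; v8 and splat8 are illustrative stand-ins for SMAUG's v8fp_t, not names from the codebase. The NHWC kernel that follows avoids this broadcast entirely, since channels are the innermost dimension there.

// Illustrative 8-wide float vector and broadcast, mirroring the mean_vec /
// gamma_vec initializers in the NCHW kernel above.
typedef float v8 __attribute__((vector_size(32)));

static inline v8 splat8(float x) {
    return (v8){ x, x, x, x, x, x, x, x };
}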

/* Batch normalization following a convolutional or pooling layer, with
 * inputs in NHWC format. Channels are the innermost dimension, so each
 * weight vector lines up directly with the input vectors and no lane
 * broadcast is needed. This kernel also supports sampled simulation.
 */
void smv_batch_norm_post_conv_nhwc_vec_fxp(float16* host_inputs,
                                           float16* host_weights,
                                           float16* host_results,
                                           float* inputs,
                                           float* weights,
                                           float* results,
                                           int inputs_dims[4],
                                           int weights_chans,
                                           int inputs_pad,
                                           int weights_pad,
                                           int weights_start,
                                           activation_type act_function,
                                           activation_param_t act_params,
                                           SamplingInfo* sampling) {
    int inputs_nums = inputs_dims[0];
    int inputs_rows = inputs_dims[1];
    int inputs_cols = inputs_dims[2];
    int inputs_chans = inputs_dims[3];
    int inputs_size = inputs_nums * inputs_rows * inputs_cols *
                      (inputs_chans + inputs_pad);
    int weights_size = 4 * (weights_chans + weights_pad);
    int results_size = inputs_size;
    int weights_start_vec = weights_start / VECTOR_SIZE;
    int inputs_chans_vec = FRAC_CEIL(inputs_chans, VECTOR_SIZE);

    // Load inputs and weights if needed.
    host_load_fp16(inputs, host_inputs, inputs_size, 0, 0);
    if (weights_start == 0)
        host_load_fp16(weights, host_weights, weights_size, 0, 0);

    VEC_ARRAY_4D(v8fp_t, _inputs, inputs, inputs_rows, inputs_cols,
                 inputs_chans + inputs_pad);
    VEC_ARRAY_2D(v8fp_t, _weights, weights, weights_chans + weights_pad);
    VEC_ARRAY_4D(v8fp_t, _results, results, inputs_rows, inputs_cols,
                 inputs_chans + inputs_pad);

    // We sample on the bn kernel only if the highest sampling level is used.
    int batch_sample = inputs_nums;
    int chan_sample = inputs_chans_vec;
    int row_sample = inputs_rows;
    int col_sample = inputs_cols;
    int sample_num = sampling->num_sample_iterations;
    if (sampling->level >= VeryHigh) {
        batch_sample = min2(batch_sample, sample_num);
        chan_sample = min2(chan_sample, sample_num);
        row_sample = min2(row_sample, sample_num);
        col_sample = min2(col_sample, sample_num);
    }
    setSamplingFactor("bn_batch", inputs_nums * 1.0 / batch_sample);
    setSamplingFactor("bn_chan", inputs_chans_vec * 1.0 / chan_sample);
    setSamplingFactor("bn_row", inputs_rows * 1.0 / row_sample);
    setSamplingFactor("bn_col", inputs_cols * 1.0 / col_sample);

    bn_batch:
    for (int i = 0; i < batch_sample; i++) {
        bn_chan:
        for (int h = 0; h < chan_sample; h++) {
            v8fp_t mean = _weights[0][h + weights_start_vec];
            v8fp_t recip_sqrt_var = _weights[1][h + weights_start_vec];
            v8fp_t gamma = _weights[2][h + weights_start_vec];
            v8fp_t beta = _weights[3][h + weights_start_vec];
            bn_row:
            for (int r = 0; r < row_sample; r++) {
                bn_col:
                for (int c = 0; c < col_sample; c++) {
                    _results[i][r][c][h] =
                            batch_norm_simd_op(_inputs[i][r][c][h],
                                               mean,
                                               recip_sqrt_var,
                                               gamma,
                                               beta);
                }
            }
        }
    }
    if (act_function != NO_ACTIVATION) {
        activation_fun_vec(
                results, results, results_size, act_function, act_params);
    }
    // Store results to the host memory.
    host_store_fp16(results, host_results, results_size, 0, 0);
}

#ifdef __cplusplus
}  // extern "C"
#endif
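
Only the NHWC kernel accepts a SamplingInfo. When sampling->level is VeryHigh or higher, each labeled loop runs at most num_sample_iterations iterations, and setSamplingFactor reports the full-to-sampled iteration ratio so gem5-Aladdin can scale the measured loop cost back up to the unsampled total. A minimal caller-side sketch follows; make_bn_sampling and the 8-iteration cap are illustrative, while the SamplingInfo fields come from common.h.

#include "smaug/operators/common.h"  // SamplingInfo, SamplingLevel

// Hypothetical helper: cap every sampled loop at 8 iterations once the
// qualitative sampling level reaches VeryHigh.
static SamplingInfo make_bn_sampling(void) {
    SamplingInfo sampling;
    sampling.level = VeryHigh;
    sampling.num_sample_iterations = 8;
    return sampling;
}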