SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
convolution_simd.c
#include <stdbool.h>
#include <stdio.h>

#include "smaug/operators/common.h"
#include "smaug/operators/smv/kernels/params.h"
#include "smaug/operators/smv/kernels/load_store_fp16_data.h"
#include "smaug/operators/smv/kernels/activation_functions_simd.h"

#ifdef __cplusplus
extern "C" {
#endif

void smv_conv3d_nhwc_vec_fxp(float16* host_inputs,
                             float16* host_weights,
                             float16* host_results,
                             float* inputs,
                             float* weights,
                             float* results,
                             int inputs_dims[4],
                             int weights_dims[4],
                             int results_dims[4],
                             int inputs_align_pad,
                             int weights_pad,
                             int results_pad,
                             int inputs_halo_pad[4],
                             int row_stride,
                             int col_stride,
                             int ifmap_start,
                             int kern_start,
                             bool accumulate,
                             bool read_inputs,
                             bool read_weights,
                             bool send_results,
                             activation_type act_function,
                             activation_param_t act_params,
                             SamplingInfo* sampling) {
    int result_rows = results_dims[1];
    int result_cols = results_dims[2];
    int result_height = results_dims[3];
    int results_size = results_dims[0] * result_rows * result_cols *
                       (result_height + results_pad);

    int k_rows = weights_dims[1];
    int k_cols = weights_dims[2];
    int k_height = weights_dims[3];
    int k_pad = weights_pad;
    int weights_size = weights_dims[0] * k_rows * k_cols * (k_height + k_pad);

    int a_rows = inputs_dims[1];
    int a_cols = inputs_dims[2];
    int a_height = inputs_dims[3];
    int a_pad = inputs_align_pad;
    int inputs_size = inputs_dims[0] * a_rows * a_cols * (a_height + a_pad);

    int top_pad = inputs_halo_pad[0];
    int bottom_pad = inputs_halo_pad[1];
    int left_pad = inputs_halo_pad[2];
    int right_pad = inputs_halo_pad[3];
    int end_row = a_rows + top_pad + bottom_pad - k_rows + 1;
    int end_col = a_cols + left_pad + right_pad - k_cols + 1;

    int valid_row_end = a_rows - 1;
    int valid_col_end = a_cols - 1;

    int in_row, in_col;
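    // Each PE drives NUM_MACC_INSTS MACC units, and each MACC unit operates
    // on VECTOR_SIZE-wide vectors, so one channel block covers
    // VECTOR_SIZE * NUM_MACC_INSTS input channels.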
    const int pe_depth = VECTOR_SIZE * NUM_MACC_INSTS;
    const v8fp_t zero = { 0, 0, 0, 0, 0, 0, 0, 0 };

    // Kernels and input are in NHWC.
    VEC_ARRAY_4D(v8fp_t, _kernels, weights, k_rows, k_cols, k_height + k_pad);
    // TODO: Support input batches.
    VEC_ARRAY_3D(v8fp_t, _a, inputs, a_cols, a_height + a_pad);
    // Results in NHWC.
    VEC_ARRAY_3D(
            v8fp_t, _result, results, result_cols, result_height + results_pad);
    int num_chan_blocks = (k_height - 1) / pe_depth;
    // Number of effective kernels for this invocation. The weights may hold
    // more kernels than the results buffer has room for output feature maps;
    // in that case, the number of effective kernels is the number of feature
    // maps in the results.
    int num_eff_kernels = min2(weights_dims[0], result_height);
    int num_kernel_blocks = (num_eff_kernels - 1) / NUM_PE_INSTS;

    // Load inputs and weights if needed.
    if (read_inputs)
        host_load_fp16(inputs, host_inputs, inputs_size, 0, 0);
    if (read_weights)
        host_load_fp16(weights, host_weights, weights_size, 0, 0);

    // Set up the sample sizes and factors.
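    // Under sampling, each labeled loop below executes only its sampled
    // iteration count; setSamplingFactor() then reports the ratio of total to
    // sampled iterations so the simulator can scale the sampled execution
    // back up to the full trip count.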
    int pe_block_sample = num_kernel_blocks + 1;
    int kern_row_sample = k_rows;
    int kern_col_sample = k_cols;
    int chan_block_sample = num_chan_blocks + 1;
    int output_row_sample = end_row;
    int output_col_sample = end_col;
    int output_row_total_iters = FRAC_CEIL(end_row, row_stride);
    int output_col_total_iters = FRAC_CEIL(end_col, col_stride);
    int output_row_sample_iters = output_row_total_iters;
    int output_col_sample_iters = output_col_total_iters;
    int sample_num = sampling->num_sample_iterations;
    if (sampling->level >= Low)
        pe_block_sample = min2(pe_block_sample, sample_num);
    if (sampling->level >= Medium) {
        kern_row_sample = min2(kern_row_sample, sample_num);
        kern_col_sample = min2(kern_col_sample, sample_num);
    }
    if (sampling->level >= High)
        chan_block_sample = min2(chan_block_sample, sample_num);
    if (sampling->level >= VeryHigh) {
        output_row_sample_iters = min2(output_row_sample_iters, sample_num);
        output_row_sample = output_row_sample_iters * row_stride;
        // Pipelined loops need at least two sampled iterations.
        output_col_sample_iters =
                min2(output_col_sample_iters, max2(2, sample_num));
        output_col_sample = output_col_sample_iters * col_stride;
    }
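    // For example, with end_row = 32, row_stride = 1, and sample_num = 2 at
    // the VeryHigh level, conv3d_row runs only 2 of its 32 iterations and is
    // assigned a sampling factor of 16.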
    setSamplingFactor("ofmap_block_iteration",
                      (num_kernel_blocks + 1) * 1.0 / pe_block_sample);
    setSamplingFactor("k_row", k_rows * 1.0 / kern_row_sample);
    setSamplingFactor("k_col", k_cols * 1.0 / kern_col_sample);
    setSamplingFactor(
            "pe_iteration", (num_chan_blocks + 1) * 1.0 / chan_block_sample);
    setSamplingFactor("conv3d_row",
                      output_row_total_iters * 1.0 / output_row_sample_iters);
    setSamplingFactor("conv3d_col",
                      output_col_total_iters * 1.0 / output_col_sample_iters);

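    // Loop nest: blocks of NUM_PE_INSTS output feature maps, then kernel rows
    // and columns, then input-channel blocks of pe_depth, and innermost the
    // output rows and columns for that (kernel position, channel block).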
    ofmap_block_iteration:
    for (int ofmap_iters = 0; ofmap_iters < pe_block_sample;
         ofmap_iters++) {  // Result channel blocks
        int ofmap_offset = ofmap_iters * NUM_PE_INSTS;
        // If we have fewer than eight output channels, don't run the extra
        // ones.
        int kEffNumPeInsts = min2(result_height - ofmap_offset, NUM_PE_INSTS);
        // Kernel rows
        k_row:
        for (int kern_row = 0; kern_row < kern_row_sample; kern_row++) {
            k_col:
            for (int kern_col = 0; kern_col < kern_col_sample;
                 kern_col++) {  // Kernel cols
                // This loops over all the input channels in groups of
                // VECTOR_SIZE * NUM_MACC_INSTS.
                pe_iteration:
                for (int ifmap_iters = 0; ifmap_iters < chan_block_sample;
                     ifmap_iters++) {
                    bool start_from_zero = (!accumulate && kern_row == 0 &&
                                            kern_col == 0 && ifmap_iters == 0);
                    int ifmap_offset =
                            (ifmap_start + ifmap_iters * pe_depth) / VECTOR_SIZE;
                    int kern_chan_offset = (ifmap_iters * pe_depth) / VECTOR_SIZE;
                    int out_i = 0;  // The result row.

                    int max_ch_grp = NUM_MACC_INSTS;
                    // This computes the number of remaining channel groups on
                    // the last iteration.
                    if (ifmap_iters == num_chan_blocks) {
                        max_ch_grp = FRAC_CEIL(
                                (k_height - ifmap_iters * pe_depth), VECTOR_SIZE);
                    }

                    // Load in all the weights at once before beginning the
                    // input loop.
                    v8fp_t kernel_reg[NUM_PE_INSTS][NUM_MACC_INSTS] = {
                        { zero }, { zero }, { zero }, { zero },
                        { zero }, { zero }, { zero }, { zero }
                    };
                    load_kern_pe:
                    for (int pe_id = 0; pe_id < kEffNumPeInsts; pe_id++) {
                        load_kern_mu:
                        for (int macc_idx = 0; macc_idx < NUM_MACC_INSTS;
                             macc_idx++) {
                            kernel_reg[pe_id][macc_idx] =
                                    (macc_idx >= max_ch_grp)
                                            ? zero
                                            : _kernels[kern_start + ofmap_offset +
                                                       pe_id][kern_row][kern_col]
                                                      [kern_chan_offset + macc_idx];
                        }
                    }
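                    // kernel_reg now holds, for each active PE, that kernel's
                    // weights at (kern_row, kern_col) for the current channel
                    // block; it is reused for every output position below.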

                    conv3d_row:
                    for (int out_row = 0; out_row < output_row_sample;
                         out_row += row_stride) {
                        int out_j = 0;  // The result col.

                        // We buffer 8 (i.e., the number of PEs) partial sums
                        // into a vector register.
                        v8fp_t results_buffer;

                        conv3d_col:
                        for (int out_col = 0; out_col < output_col_sample;
                             out_col += col_stride) {
                            // Local regs. These should always be sized the
                            // same (so NUM_PE_INSTS, rather than
                            // kEffNumPeInsts).
                            v8fp_t smv_conv_product_reg[NUM_PE_INSTS]
                                                       [NUM_MACC_INSTS];
                            v8fp_t act_reg[NUM_MACC_INSTS];
                            results_buffer =
                                    start_from_zero
                                            ? zero
                                            : _result[out_i][out_j][ofmap_iters];
                            in_row = out_row - top_pad + kern_row;
                            in_col = out_col - left_pad + kern_col;
                            bool in_padding_row =
                                    in_row < 0 || in_row > valid_row_end;
                            bool in_padding_col =
                                    in_col < 0 || in_col > valid_col_end;

                            // Load in the activations first, then broadcast
                            // them to all the PEs.
                            load_act_mu:
                            for (int macc_idx = 0; macc_idx < NUM_MACC_INSTS;
                                 macc_idx++) {
                                bool is_padding = in_padding_row ||
                                                  in_padding_col ||
                                                  macc_idx >= max_ch_grp;
                                act_reg[macc_idx] =
                                        is_padding
                                                ? zero
                                                : _a[in_row][in_col]
                                                    [ifmap_offset + macc_idx];
                            }
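                            // act_reg now holds this input position's
                            // activations for the current channel block (zero
                            // in the halo padding); the same values are
                            // broadcast to every PE.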

                            v8fp_t accum_vec_reg[NUM_PE_INSTS] = {
                                zero, zero, zero, zero, zero, zero, zero, zero
                            };
                            float accum_reg[NUM_PE_INSTS] = { 0, 0, 0, 0,
                                                              0, 0, 0, 0 };
                            pe_groups:
                            for (int pe_id = 0; pe_id < kEffNumPeInsts;
                                 pe_id++) {
                                mu_groups:
                                for (int macc_idx = 0; macc_idx < NUM_MACC_INSTS;
                                     macc_idx++) {
                                    smv_conv_product_reg[pe_id][macc_idx] =
                                            kernel_reg[pe_id][macc_idx] *
                                            act_reg[macc_idx];
                                }
                                reduction_1:
                                for (int macc_idx = 0; macc_idx < NUM_MACC_INSTS;
                                     macc_idx++) {
                                    accum_vec_reg[pe_id] +=
                                            smv_conv_product_reg[pe_id][macc_idx];
                                }
                                reduction_2:
                                for (int vec_i = 0; vec_i < VECTOR_SIZE;
                                     vec_i++) {
                                    accum_reg[pe_id] +=
                                            accum_vec_reg[pe_id][vec_i];
                                }
                                results_buffer[pe_id] += accum_reg[pe_id];
                            }
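                            // Each PE has reduced its products across MACC
                            // units and vector lanes and accumulated the
                            // scalar result into its lane of results_buffer.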

                            // Write the results back to scratchpad.
                            _result[out_i][out_j][ofmap_iters] = results_buffer;
                            out_j++;
                        }
                        out_i++;
                        out_j = 0;
                    }
                }
            }
        }
    }
    // Only run activation functions when the results are finished.
    if (act_function != NO_ACTIVATION && send_results) {
        activation_fun_vec(
                results, results, results_size, act_function, act_params);
    }
    // Store results to the host memory if needed.
    if (send_results)
        host_store_fp16(results, host_results, results_size, 0, 0);
}

#ifdef __cplusplus
}  // extern "C"
#endif
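Functionally, the kernel above computes a strided 3D convolution over an NHWC input with NHWC-laid-out kernels: each output channel is the sum, over kernel rows, kernel columns, and input channels, of weight times activation, with zeros substituted in the halo padding region. The unvectorized sketch below illustrates the same arithmetic for a single batch; the function name conv3d_nhwc_reference, its parameters, and the flat float arrays are illustrative assumptions, not part of the SMAUG sources.

/*
 * Illustrative reference only: a naive NHWC 3D convolution for one batch.
 * Layouts assumed: input[row][col][chan], weights[kern][row][col][chan],
 * output[row][col][kern]. Out-of-bounds input positions (the halo) contribute
 * zero, mirroring the is_padding logic in the kernel above.
 */
static void conv3d_nhwc_reference(const float* input, const float* weights,
                                  float* output, int in_rows, int in_cols,
                                  int chans, int num_kerns, int k_rows,
                                  int k_cols, int top_pad, int left_pad,
                                  int out_rows, int out_cols, int row_stride,
                                  int col_stride) {
    for (int out_r = 0; out_r < out_rows; out_r++) {
        for (int out_c = 0; out_c < out_cols; out_c++) {
            for (int k = 0; k < num_kerns; k++) {
                float acc = 0;
                for (int kr = 0; kr < k_rows; kr++) {
                    for (int kc = 0; kc < k_cols; kc++) {
                        int in_r = out_r * row_stride - top_pad + kr;
                        int in_c = out_c * col_stride - left_pad + kc;
                        if (in_r < 0 || in_r >= in_rows || in_c < 0 ||
                            in_c >= in_cols)
                            continue;  // Halo padding contributes nothing.
                        for (int ch = 0; ch < chans; ch++) {
                            acc += input[(in_r * in_cols + in_c) * chans + ch] *
                                   weights[((k * k_rows + kr) * k_cols + kc) *
                                                   chans + ch];
                        }
                    }
                }
                output[(out_r * out_cols + out_c) * num_kerns + k] = acc;
            }
        }
    }
}

The SMV kernel produces the same values but vectorizes the channel loop across VECTOR_SIZE-wide lanes and NUM_MACC_INSTS MACC units per PE, and distributes up to NUM_PE_INSTS output channels across the PEs.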
Referenced symbols

activation_type: enum _activation_type. The activation function to apply to an operator's output in hardware.
_SamplingInfo: simulation sampling information maintained by the Operator and passed to the accelerated kernel. Defined in common.h:262.
host_store_fp16: void host_store_fp16(float *local_data, float16 *remote_data, int num_elems, int local_offset, int remote_offset). Defined in load_store_fp16_data.c:45.
host_load_fp16: void host_load_fp16(float *local_data, float16 *remote_data, int num_elems, int local_offset, int remote_offset). Defined in load_store_fp16_data.c:7.
FRAC_CEIL: #define FRAC_CEIL(A, B). Implements the ceiling function of A/B. Defined in common.h:505.
_SamplingInfo::num_sample_iterations: int num_sample_iterations. The requested number of iterations to run a sampled loop. Defined in common.h:269.
_activation_param_t: parameters to the activation function hardware. Defined in common.h:194.
smv_conv3d_nhwc_vec_fxp: the kernel listed above. Defined in convolution_simd.c:53.
v8fp_t: fp_t v8fp_t. 8 packed 32-bit floating-point values. Defined in common.h:301.
_SamplingInfo::level: SamplingLevel level. Qualitative level of sampling. Defined in common.h:264.
load_store_fp16_data.h: Aladdin kernels to load/store FP16 data to/from host memory.
common.h: utilities for writing and invoking Aladdin kernels from Operators.
VECTOR_SIZE: #define VECTOR_SIZE. Vector size used in SMV backends. Defined in common.h:293.