SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
matrix_multiply.c
1 #include <assert.h>
2 #include <stdio.h>
3 
5 #include "smaug/operators/smv/kernels/params.h"
7 #include "smaug/operators/smv/kernels/activation_functions_simd.h"
8 
9 #ifdef __cplusplus
10 extern "C" {
11 #endif
12 
        float16* host_b,            // Host-side weight buffer (fp16).
        float16* host_results,      // Host-side results buffer (fp16).
        float* a,                   // Local (scratchpad) input tile, fp32.
        float* b,                   // Local (scratchpad) weight tile, fp32.
        float* results,             // Local (scratchpad) output tile, fp32.
        int a_dims[2],              // { height, width } of a, excluding padding.
        int b_dims[2],              // { height, width } of b, excluding padding.
        int results_dims[2],        // { height, width } of results, excl. padding.
        int a_pad,                  // Alignment padding on each row of a.
        int b_pad,                  // Alignment padding on each row of b.
        int results_pad,            // Alignment padding on each row of results.
        int a_start,                // Starting column (in elements) into a.
        int result_start,           // Starting column (in elements) into results.
        bool accumulate,            // If true, add into existing results values.
        bool read_inputs,           // If true, load a from host_a first.
        bool send_results,          // If true, run activation and store to host.
        activation_type act_function,   // Activation applied when sending results.
        activation_param_t act_params,  // Parameters for the activation function.
        SamplingInfo* sampling) {       // Simulation sampling configuration.
    // Computes results = a x b^T (each row of b is a weight vector dotted
    // against each row of a), with optional accumulation into existing
    // partial results and an optional fused activation function on the way
    // out. All local buffers are fp32; host transfers are fp16.
    int a_width = a_dims[1];
    int a_height = a_dims[0];
    int b_width = b_dims[1];
    int b_height = b_dims[0];
    int results_width = results_dims[1];
    int results_height = results_dims[0];
    // The vectorized inner loops require the padded weight row length to be
    // an exact number of vectors.
    ASSERT((b_width + b_pad) % VECTOR_SIZE == 0 &&
           "Width of b must be a multiple of VECTOR_SIZE!");
    int a_width_vec = (a_width + a_pad) / VECTOR_SIZE;
    int b_width_vec = (b_width + b_pad) / VECTOR_SIZE;
    int a_size = a_height * (a_width + a_pad);
    int b_size = b_height * (b_width + b_pad);
    int results_size = results_height * (results_width + results_pad);

    v8fp_t zero = (v8fp_t){ 0, 0, 0, 0, 0, 0, 0, 0 };
    // Reinterpret the flat fp32 arrays as 2D arrays of 8-wide vectors.
    VEC_ARRAY_2D(v8fp_t, _a, a, a_width + a_pad);
    VEC_ARRAY_2D(v8fp_t, _b, b, b_width + b_pad);
    VEC_ARRAY_2D(v8fp_t, _results, results, results_width + results_pad);
    v8fp_t partial_sums;

    // Load a and b if needed.
    // NOTE(review): only the load of `a` is guarded by read_inputs; `b` is
    // reloaded on every invocation. Presumably each call processes a fresh
    // weight tile while the input tile may be reused across calls — confirm
    // against the tiling logic in the caller before "fixing" this.
    if (read_inputs)
        host_load_fp16(a, host_a, a_size, 0, 0);
    host_load_fp16(b, host_b, b_size, 0, 0);

    // We sample on the FC kernel only if the highest sampling level is used.
    // Sampling truncates the b_col loop trip count and reports the
    // truncation ratio to the simulator via setSamplingFactor.
    int b_col_sample = b_width_vec;
    int b_col_total_iters = FRAC_CEIL(b_width_vec, NUM_MACC_INSTS);
    int b_col_sample_iters = b_col_total_iters;
    int sample_num = sampling->num_sample_iterations;
    if (sampling->level >= VeryHigh) {
        // Pipelined loops need at minimum 2 sampled iterations.
        b_col_sample_iters = min2(b_col_sample_iters, max2(2, sample_num));
        b_col_sample = b_col_sample_iters * NUM_MACC_INSTS;
    }
    setSamplingFactor("b_col", b_col_total_iters * 1.0 / b_col_sample_iters);

    // Outer loop: one input activation row at a time.
    a_act:
    for (int a_act = 0; a_act < a_height; a_act++) {
        // Middle loop: NUM_PE_INSTS weight rows (output columns) per pass.
        b_row:
        for (int b_row = 0; b_row < b_height; b_row += NUM_PE_INSTS) {
            // At the start of each VECTOR_SIZE-wide output group, seed the
            // vector of partial sums either from existing results
            // (accumulate) or from zero.
            if (b_row % VECTOR_SIZE == 0) {
                if (accumulate) {
                    partial_sums = _results[a_act][(result_start + b_row) /
                                                  VECTOR_SIZE];
                } else {
                    partial_sums = zero;
                }
            }

            // Inner loop: march along the shared (reduction) dimension,
            // NUM_MACC_INSTS vectors per iteration.
            b_col:
            for (int b_col = 0; b_col < b_col_sample; b_col += NUM_MACC_INSTS) {
                // To work around an Aladdin dependence analysis bug where
                // InsertElement operations on vector types can be
                // serialized across unrolled loop iterations, we use a
                // normal scalar array here instead. Prior to committing the
                // data to the scratchpad, we'll copy this data back to a
                // vector register.
                float partial_sums_inner[VECTOR_SIZE] = {
                    0, 0, 0, 0, 0, 0, 0, 0
                };

                // Fetch NUM_MACC_INSTS vectors from the current input row,
                // zero-padding past the end of the row.
                v8fp_t a_reg[NUM_MACC_INSTS];
                a_reg_load:
                for (int a_vec = 0; a_vec < NUM_MACC_INSTS; a_vec++) {
                    int a_col = a_start / VECTOR_SIZE + b_col + a_vec;
                    a_reg[a_vec] =
                            a_col >= a_width_vec ? zero : _a[a_act][a_col];
                }

                // Each PE handles one weight row; together they produce
                // NUM_PE_INSTS adjacent output elements.
                pe_insts:
                for (int pe_id = 0; pe_id < NUM_PE_INSTS; pe_id++) {
                    // Fetch the matching weight vectors, zero-padding out
                    // of range rows/columns.
                    v8fp_t b_reg[NUM_MACC_INSTS];
                    b_reg_load:
                    for (int macc_idx = 0; macc_idx < NUM_MACC_INSTS;
                         macc_idx++) {
                        int pe_row = b_row + pe_id;
                        int this_b_col = b_col + macc_idx;
                        b_reg[macc_idx] =
                                (pe_row >= b_height ||
                                 this_b_col >= b_width_vec)
                                        ? zero
                                        : _b[pe_row][b_col + macc_idx];
                    }

                    // Elementwise multiply: NUM_MACC_INSTS vector products.
                    v8fp_t product_reg[NUM_MACC_INSTS];
                    core_mul:
                    for (int macc_idx = 0; macc_idx < NUM_MACC_INSTS;
                         macc_idx++) {
                        product_reg[macc_idx] =
                                a_reg[macc_idx] * b_reg[macc_idx];
                    }

                    // First reduction: sum the product vectors together.
                    v8fp_t accum_vec_reg = zero;
                    reduce_1:
                    for (int macc_idx = 0; macc_idx < NUM_MACC_INSTS;
                         macc_idx++) {
                        accum_vec_reg += product_reg[macc_idx];
                    }

                    // Second reduction: horizontal sum of the vector lanes
                    // into a single scalar for this PE's output element.
                    float accum_reg = 0;
                    reduce_2:
                    for (int vec_i = 0; vec_i < VECTOR_SIZE; vec_i++) {
                        accum_reg += accum_vec_reg[vec_i];
                    }
                    partial_sums_inner[pe_id] += accum_reg;
                }
                // Copy the scalar workaround array back into the vector
                // register (see the Aladdin note above).
                copy_psums:
                for (int i = 0; i < NUM_PE_INSTS; i++) {
                    partial_sums[i] += partial_sums_inner[i];
                }
            }

            // Commit the vector of partial sums once a full VECTOR_SIZE
            // group of output columns is finished (or we ran off the end
            // of b).
            int next_b_row = b_row + NUM_PE_INSTS;
            if (next_b_row % VECTOR_SIZE == 0 || next_b_row >= b_height) {
                _results[a_act][(result_start + b_row) / VECTOR_SIZE] =
                        partial_sums;
            }
        }
    }
    // Only run activation functions when the results are finished.
    if (act_function != NO_ACTIVATION && send_results) {
        activation_fun_vec(
                results, results, results_size, act_function, act_params);
    }
    // Store results to the host memory if needed.
    if (send_results)
        host_store_fp16(results, host_results, results_size, 0, 0);
}
208 
209 #ifdef __cplusplus
210 } // extern "C"
211 #endif
activation_type
enum _activation_type activation_type
The activation function to apply to an operator's output in hardware.
_SamplingInfo
Simulation sampling information maintained by the Operator and passed to the accelerated kernel.
Definition: common.h:262
host_store_fp16
void host_store_fp16(float *local_data, float16 *remote_data, int num_elems, int local_offset, int remote_offset)
Definition: load_store_fp16_data.c:45
host_load_fp16
void host_load_fp16(float *local_data, float16 *remote_data, int num_elems, int local_offset, int remote_offset)
Definition: load_store_fp16_data.c:7
FRAC_CEIL
#define FRAC_CEIL(A, B)
Implements the ceiling function of A/B.
Definition: common.h:505
_SamplingInfo::num_sample_iterations
int num_sample_iterations
The requested number of iterations to run a sampled loop.
Definition: common.h:269
smv_matrix_multiply_transpose_nc_vec_fxp
void smv_matrix_multiply_transpose_nc_vec_fxp(float16 *host_a, float16 *host_b, float16 *host_results, float *a, float *b, float *results, int a_dims[2], int b_dims[2], int results_dims[2], int a_pad, int b_pad, int results_pad, int a_start, int result_start, bool accumulate, bool read_inputs, bool send_results, activation_type act_function, activation_param_t act_params, SamplingInfo *sampling)
Definition: matrix_multiply.c:59
_activation_param_t
Parameters to the activation function hardware.
Definition: common.h:194
v8fp_t
fp_t v8fp_t
8 packed 32-bit floating point values.
Definition: common.h:301
_SamplingInfo::level
SamplingLevel level
Qualitative level of sampling.
Definition: common.h:264
load_store_fp16_data.h
Aladdin kernels to load/store FP16 data to/from host memory.
common.h
Utilities for writing and invoking Aladdin kernels from Operators.
VECTOR_SIZE
#define VECTOR_SIZE
Vector size used in SMV backends.
Definition: common.h:293
ASSERT
#define ASSERT(x)
An assertion macro which disables asserts in LLVM-Tracer instrumented code.
Definition: common.h:530