SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
|
Kernel functions meant to be run under Aladdin. More...
Functions | |
float | batch_norm_op (float input, float mean, float recip_sqrt_var, float gamma, float beta) |
void | ref_batch_norm_post_fc (float *inputs, float *mean, float *variance, float *gamma, float *beta, float *result, int input_nums, int input_size, int input_pad, activation_type act_function, activation_param_t act_params) |
void | ref_batch_norm_nchw_post_conv (float *inputs, float *mean, float *variance, float *gamma, float *beta, float *result, int img_nums, int img_chans, int img_rows, int img_cols, int img_pad, int wgt_pad, activation_type act_function, activation_param_t act_params) |
void | ref_batch_norm_nhwc_post_conv (float *inputs, float *mean, float *variance, float *gamma, float *beta, float *result, int img_nums, int img_rows, int img_cols, int img_chans, int img_pad, int wgt_pad, activation_type act_function, activation_param_t act_params) |
void | ref_conv3d_nchw_valid_padding (float *input, float *kernels, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int k_num, int k_rows, int k_cols, int k_pad, int k_row_stride, int k_col_stride, int res_rows, int res_cols, int res_pad, activation_type act_function, activation_param_t act_params) |
void | ref_conv3d_nchw_same_padding (float *input, float *kernels, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int k_num, int k_rows, int k_cols, int k_pad, int k_row_stride, int k_col_stride, int res_rows, int res_cols, int res_pad, activation_type act_function, activation_param_t act_params) |
void | ref_conv3d_nhwc_valid_padding (float *input, float *kernels, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int k_num, int k_rows, int k_cols, int k_pad, int k_row_stride, int k_col_stride, int res_rows, int res_cols, int res_pad, activation_type act_function, activation_param_t act_params) |
void | ref_conv3d_nhwc_same_padding (float *input, float *kernels, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int k_num, int k_rows, int k_cols, int k_pad, int k_row_stride, int k_col_stride, int res_rows, int res_cols, int res_pad, activation_type act_function, activation_param_t act_params) |
void | ref_conv2d_nchw_valid_padding (float *input, float *kernels, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int k_rows, int k_cols, int k_pad, int k_row_stride, int k_col_stride, int res_rows, int res_cols, int res_pad) |
void | ref_conv2d_nchw_same_padding (float *input, float *kernels, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int k_rows, int k_cols, int k_pad, int k_row_stride, int k_col_stride, int res_rows, int res_cols, int res_pad) |
void | ref_eltwise_add (float *input0, float *input1, float *results, int input_size) |
void | ref_eltwise_mul (float *input0, float *input1, float *results, int input_size) |
void | ref_greater (float *input0, float *input1, bool *results, int input_size) |
void | ref_greater_equal (float *input0, float *input1, bool *results, int input_size) |
void | ref_inner_product_ab_times_bc (float *a, float *b, float *c, int a_height, int a_width, int b_width, int a_pad, int b_pad, int c_pad, activation_type act_function, activation_param_t act_params) |
void | ref_inner_product_ab_times_cb (float *a, float *b, float *c, int a_height, int b_width, int b_height, int a_pad, int b_pad, int c_pad, activation_type act_function, activation_param_t act_params) |
void | ref_less (float *input0, float *input1, bool *results, int input_size) |
void | ref_less_equal (float *input0, float *input1, bool *results, int input_size) |
void | ref_max_pooling_nchw_treemax (float *input, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int res_rows, int res_cols, int res_pad, int pool_row_size, int pool_col_size, int pool_row_stride, int pool_col_stride) |
void | ref_max_pooling_nhwc_treemax (float *input, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int res_rows, int res_cols, int res_pad, int pool_row_size, int pool_col_size, int pool_row_stride, int pool_col_stride) |
void | ref_max_pooling_nchw_itermax (float *input, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int res_rows, int res_cols, int res_pad, int pool_row_size, int pool_col_size, int pool_row_stride, int pool_col_stride) |
void | ref_max_pooling_nhwc_itermax (float *input, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int res_rows, int res_cols, int res_pad, int pool_row_size, int pool_col_size, int pool_row_stride, int pool_col_stride) |
void | ref_avg_pooling_nchw (float *input, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int res_rows, int res_cols, int res_pad, int pool_row_size, int pool_col_size, int pool_row_stride, int pool_col_stride) |
void | ref_avg_pooling_nhwc (float *input, float *result, int img_num, int img_chans, int img_rows, int img_cols, int img_pad, int res_rows, int res_cols, int res_pad, int pool_row_size, int pool_col_size, int pool_row_stride, int pool_col_stride) |
void | ref_softmax_nc (float *inputs, float *results, int input_num, int input_size, int input_pad) |
void | smv_activation_fun_nc_vec_fxp (float16 *host_inputs, float16 *host_results, float *inputs, float *results, int inputs_size, activation_type function, activation_param_t params) |
void | smv_softmax_nc_vec_fxp (float16 *host_inputs, float16 *host_results, float *inputs, float *results, int input_num, int input_size, int input_pad) |
v8fp_t | batch_norm_simd_op (v8fp_t input, v8fp_t mean, v8fp_t recip_sqrt_var, v8fp_t gamma, v8fp_t beta) |
void | smv_batch_norm_post_fc_nc_vec_fxp (float16 *host_inputs, float16 *host_weights, float16 *host_results, float *inputs, float *weights, float *results, int inputs_dims[2], int weights_acts, int inputs_pad, int inputs_start, int send_results, activation_type act_function, activation_param_t act_params) |
void | smv_batch_norm_post_conv_nchw_vec_fxp (float16 *host_inputs, float16 *host_weights, float16 *host_results, float *inputs, float *weights, float *results, int inputs_dims[4], int weights_chans, int inputs_pad, int weights_pad, int weights_start, activation_type act_function, activation_param_t act_params) |
void | smv_batch_norm_post_conv_nhwc_vec_fxp (float16 *host_inputs, float16 *host_weights, float16 *host_results, float *inputs, float *weights, float *results, int inputs_dims[4], int weights_chans, int inputs_pad, int weights_pad, int weights_start, activation_type act_function, activation_param_t act_params, SamplingInfo *sampling) |
void | smv_less_nc_vec_fxp (float16 *host_inputs0, float16 *host_inputs1, bool *host_results, float *inputs0, float *inputs1, bool *results, int inputs_size) |
void | smv_less_equal_nc_vec_fxp (float16 *host_inputs0, float16 *host_inputs1, bool *host_results, float *inputs0, float *inputs1, bool *results, int inputs_size) |
void | smv_greater_nc_vec_fxp (float16 *host_inputs0, float16 *host_inputs1, bool *host_results, float *inputs0, float *inputs1, bool *results, int inputs_size) |
void | smv_greater_equal_nc_vec_fxp (float16 *host_inputs0, float16 *host_inputs1, bool *host_results, float *inputs0, float *inputs1, bool *results, int inputs_size) |
void | smv_conv3d_nhwc_vec_fxp (float16 *host_inputs, float16 *host_weights, float16 *host_results, float *inputs, float *weights, float *results, int inputs_dims[4], int weights_dims[4], int results_dims[4], int inputs_align_pad, int weights_pad, int results_pad, int inputs_halo_pad[4], int row_stride, int col_stride, int ifmap_start, int kern_start, bool accumulate, bool read_inputs, bool read_weights, bool send_results, activation_type act_function, activation_param_t act_params, SamplingInfo *sampling) |
void | smv_eltwise_add_nc_vec_fxp (float16 *host_inputs0, float16 *host_inputs1, float16 *host_results, float *inputs0, float *inputs1, float *results, int inputs_size) |
void | smv_eltwise_mul_nc_vec_fxp (float16 *host_inputs0, float16 *host_inputs1, float16 *host_results, float *inputs0, float *inputs1, float *results, int inputs_size) |
void | host_load_fp16 (float *local_data, float16 *remote_data, int num_elems, int local_offset, int remote_offset) |
void | host_store_fp16 (float *local_data, float16 *remote_data, int num_elems, int local_offset, int remote_offset) |
void | smv_matrix_multiply_transpose_nc_vec_fxp (float16 *host_a, float16 *host_b, float16 *host_results, float *a, float *b, float *results, int a_dims[2], int b_dims[2], int results_dims[2], int a_pad, int b_pad, int results_pad, int a_start, int result_start, bool accumulate, bool read_inputs, bool send_results, activation_type act_function, activation_param_t act_params, SamplingInfo *sampling) |
void | smv_maxpooling_nhwc_vec_fxp (float16 *host_inputs, float16 *host_results, float *inputs, float *results, int inputs_dims[4], int results_dims[4], int inputs_pad, int results_pad, int pool_rows, int pool_cols, int row_stride, int col_stride, int ofmap_start, SamplingInfo *sampling) |
void | smv_avgpooling_nhwc_vec_fxp (float16 *host_inputs, float16 *host_results, float *inputs, float *results, int inputs_dims[4], int results_dims[4], int inputs_pad, int results_pad, int pool_rows, int pool_cols, int row_stride, int col_stride, int ofmap_start, SamplingInfo *sampling) |
void | ref_activation_fun_nc (float *inputs, float *results, int inputs_size, activation_type function, activation_param_t params) |
Top level entry point for all Reference activation functions. | |
Kernel functions meant to be run under Aladdin.
float batch_norm_op | ( | float | input, |
float | mean, | ||
float | recip_sqrt_var, | ||
float | gamma, | ||
float | beta | ||
) |
Batch normalizes one input value.
input | Input activation. |
mean | Batch mean |
recip_sqrt_var | 1/sqrt(var + eps), which is precomputed to avoid having to run a sqrt and division in the ASIC. |
gamma | Gamma parameter. |
beta | Beta parameter. |
Definition at line 23 of file ref_batch_norm_op.cpp.
v8fp_t batch_norm_simd_op | ( | v8fp_t | input, |
v8fp_t | mean, | ||
v8fp_t | recip_sqrt_var, | ||
v8fp_t | gamma, | ||
v8fp_t | beta | ||
) |
Batch normalizes one SIMD vector of input values.
input | Input activation. |
mean | Batch mean |
recip_sqrt_var | 1/sqrt(var + eps), which is precomputed to avoid having to run a sqrt and division in the ASIC. |
gamma | Gamma parameter. |
beta | Beta parameter. |
Definition at line 25 of file batch_norm.c.
void host_load_fp16 | ( | float * | local_data, |
float16 * | remote_data, | ||
int | num_elems, | ||
int | local_offset, | ||
int | remote_offset | ||
) |
Loads half-precision fp data from the host and converts it into single-precision data locally on the accelerator.
The transfer operation is pipelined so it can be overlapped with the conversion operation. Each transfer is at most one page in size (4KB), which is converted into 8KB of data. The conversion is done in-place, so no additional SRAM is required to buffer the FP16 data.
local_data | Single-precision accelerator-local scratchpad. |
remote_data | Half-precision host memory address. |
num_elems | Number of elements to copy. |
local_offset | Offset into local array to start copying data to in elements. |
remote_offset | Offset into remote memory to start copying data from in elements. |
Definition at line 7 of file load_store_fp16_data.c.
void host_store_fp16 | ( | float * | local_data, |
float16 * | remote_data, | ||
int | num_elems, | ||
int | local_offset, | ||
int | remote_offset | ||
) |
Converts single-precision fp data from the accelerator into half-precision data and copies it to the host.
The transfer operation is pipelined so it can be overlapped with the conversion operation. Each transfer is at most one page in size (4KB), which is converted from 8KB of data. The conversion is done in-place, so no additional SRAM is required to buffer the FP16 data.
local_data | Single-precision accelerator-local scratchpad. |
remote_data | Half-precision host memory address. |
num_elems | Number of elements to copy. |
local_offset | Offset into local array to start copying data from, in elements. |
remote_offset | Offset into remote memory to start copying data to, in elements. |
Definition at line 45 of file load_store_fp16_data.c.
void ref_avg_pooling_nchw | ( | float * | input, |
float * | result, | ||
int | img_num, | ||
int | img_chans, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_pad, | ||
int | res_rows, | ||
int | res_cols, | ||
int | res_pad, | ||
int | pool_row_size, | ||
int | pool_col_size, | ||
int | pool_row_stride, | ||
int | pool_col_stride | ||
) |
A Reference implementation of AvgPoolingOp on NCHW data.
Definition at line 256 of file ref_pooling_op.cpp.
void ref_avg_pooling_nhwc | ( | float * | input, |
float * | result, | ||
int | img_num, | ||
int | img_chans, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_pad, | ||
int | res_rows, | ||
int | res_cols, | ||
int | res_pad, | ||
int | pool_row_size, | ||
int | pool_col_size, | ||
int | pool_row_stride, | ||
int | pool_col_stride | ||
) |
A Reference implementation of AvgPoolingOp on NHWC data.
Definition at line 308 of file ref_pooling_op.cpp.
void ref_batch_norm_nchw_post_conv | ( | float * | inputs, |
float * | mean, | ||
float * | variance, | ||
float * | gamma, | ||
float * | beta, | ||
float * | result, | ||
int | img_nums, | ||
int | img_chans, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_pad, | ||
int | wgt_pad, | ||
activation_type | act_function, | ||
activation_param_t | act_params | ||
) |
A Reference implementation of batch normalization following a convolutional/pooling layer on NCHW data.
After conv/pooling, we only have a gamma/beta per output feature map, not per activation.
Definition at line 85 of file ref_batch_norm_op.cpp.
void ref_batch_norm_nhwc_post_conv | ( | float * | inputs, |
float * | mean, | ||
float * | variance, | ||
float * | gamma, | ||
float * | beta, | ||
float * | result, | ||
int | img_nums, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_chans, | ||
int | img_pad, | ||
int | wgt_pad, | ||
activation_type | act_function, | ||
activation_param_t | act_params | ||
) |
A Reference implementation of batch normalization following a convolutional/pooling layer on NHWC data.
After conv/pooling, we only have a gamma/beta per output feature map, not per activation.
Definition at line 147 of file ref_batch_norm_op.cpp.
void ref_batch_norm_post_fc | ( | float * | inputs, |
float * | mean, | ||
float * | variance, | ||
float * | gamma, | ||
float * | beta, | ||
float * | result, | ||
int | input_nums, | ||
int | input_size, | ||
int | input_pad, | ||
activation_type | act_function, | ||
activation_param_t | act_params | ||
) |
A Reference implementation of batch normalization following a fully-connected layer.
In this case, we have one pair of gamma/beta weights per activation.
Definition at line 40 of file ref_batch_norm_op.cpp.
void ref_conv2d_nchw_same_padding | ( | float * | input, |
float * | kernels, | ||
float * | result, | ||
int | img_num, | ||
int | img_chans, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_pad, | ||
int | k_rows, | ||
int | k_cols, | ||
int | k_pad, | ||
int | k_row_stride, | ||
int | k_col_stride, | ||
int | res_rows, | ||
int | res_cols, | ||
int | res_pad | ||
) |
A Reference implementation of a depthwise convolution on NCHW data with same padding.
Definition at line 83 of file ref_depthwise_convolution_op.cpp.
void ref_conv2d_nchw_valid_padding | ( | float * | input, |
float * | kernels, | ||
float * | result, | ||
int | img_num, | ||
int | img_chans, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_pad, | ||
int | k_rows, | ||
int | k_cols, | ||
int | k_pad, | ||
int | k_row_stride, | ||
int | k_col_stride, | ||
int | res_rows, | ||
int | res_cols, | ||
int | res_pad | ||
) |
A Reference implementation of a depthwise convolution on NCHW data with valid padding. The Reference backend requires no alignment padding so all _pad parameters can be zero.
Definition at line 15 of file ref_depthwise_convolution_op.cpp.
void ref_conv3d_nchw_same_padding | ( | float * | input, |
float * | kernels, | ||
float * | result, | ||
int | img_num, | ||
int | img_chans, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_pad, | ||
int | k_num, | ||
int | k_rows, | ||
int | k_cols, | ||
int | k_pad, | ||
int | k_row_stride, | ||
int | k_col_stride, | ||
int | res_rows, | ||
int | res_cols, | ||
int | res_pad, | ||
activation_type | act_function, | ||
activation_param_t | act_params | ||
) |
A Reference implementation of a 3D convolution on NCHW data with same padding.
Definition at line 94 of file ref_convolution_op.cpp.
void ref_conv3d_nchw_valid_padding | ( | float * | input, |
float * | kernels, | ||
float * | result, | ||
int | img_num, | ||
int | img_chans, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_pad, | ||
int | k_num, | ||
int | k_rows, | ||
int | k_cols, | ||
int | k_pad, | ||
int | k_row_stride, | ||
int | k_col_stride, | ||
int | res_rows, | ||
int | res_cols, | ||
int | res_pad, | ||
activation_type | act_function, | ||
activation_param_t | act_params | ||
) |
A Reference implementation of a 3D convolution on NCHW data with valid padding.
Definition at line 16 of file ref_convolution_op.cpp.
void ref_conv3d_nhwc_same_padding | ( | float * | input, |
float * | kernels, | ||
float * | result, | ||
int | img_num, | ||
int | img_chans, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_pad, | ||
int | k_num, | ||
int | k_rows, | ||
int | k_cols, | ||
int | k_pad, | ||
int | k_row_stride, | ||
int | k_col_stride, | ||
int | res_rows, | ||
int | res_cols, | ||
int | res_pad, | ||
activation_type | act_function, | ||
activation_param_t | act_params | ||
) |
A Reference implementation of a 3D convolution on NHWC data with same padding.
Definition at line 266 of file ref_convolution_op.cpp.
void ref_conv3d_nhwc_valid_padding | ( | float * | input, |
float * | kernels, | ||
float * | result, | ||
int | img_num, | ||
int | img_chans, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_pad, | ||
int | k_num, | ||
int | k_rows, | ||
int | k_cols, | ||
int | k_pad, | ||
int | k_row_stride, | ||
int | k_col_stride, | ||
int | res_rows, | ||
int | res_cols, | ||
int | res_pad, | ||
activation_type | act_function, | ||
activation_param_t | act_params | ||
) |
A Reference implementation of a 3D convolution on NHWC data with valid padding.
Definition at line 188 of file ref_convolution_op.cpp.
void ref_eltwise_add | ( | float * | input0, |
float * | input1, | ||
float * | results, | ||
int | input_size | ||
) |
A Reference implementation of elementwise addition.
Definition at line 13 of file ref_eltwise_add_op.cpp.
void ref_eltwise_mul | ( | float * | input0, |
float * | input1, | ||
float * | results, | ||
int | input_size | ||
) |
A Reference implementation of elementwise multiplication.
Definition at line 13 of file ref_eltwise_mul_op.cpp.
void ref_greater | ( | float * | input0, |
float * | input1, | ||
bool * | results, | ||
int | input_size | ||
) |
A Reference implementation of elementwise greater-than.
Definition at line 13 of file ref_greater_op.cpp.
void ref_greater_equal | ( | float * | input0, |
float * | input1, | ||
bool * | results, | ||
int | input_size | ||
) |
A Reference implementation of elementwise greater-than-or-equal-to.
Definition at line 27 of file ref_greater_op.cpp.
void ref_inner_product_ab_times_bc | ( | float * | a, |
float * | b, | ||
float * | c, | ||
int | a_height, | ||
int | a_width, | ||
int | b_width, | ||
int | a_pad, | ||
int | b_pad, | ||
int | c_pad, | ||
activation_type | act_function, | ||
activation_param_t | act_params | ||
) |
A Reference implementation of an inner product operator: C = A x B.
a | A matrix of dimensions a_height x a_width |
b | A matrix of dimensions a_width x b_width |
c | A matrix of dimensions a_height x b_width |
a_height | Number of rows in A |
a_width | Number of columns in A |
b_width | Number of columns in B |
a_pad | Additional alignment zero-padding on a. |
b_pad | Additional alignment zero-padding on b. |
c_pad | Additional alignment zero-padding on c. |
act_function | The activation function to apply on the result of the inner product. |
act_params | Parameters to the activation function. |
Definition at line 28 of file ref_inner_product_op.cpp.
void ref_inner_product_ab_times_cb | ( | float * | a, |
float * | b, | ||
float * | c, | ||
int | a_height, | ||
int | b_width, | ||
int | b_height, | ||
int | a_pad, | ||
int | b_pad, | ||
int | c_pad, | ||
activation_type | act_function, | ||
activation_param_t | act_params | ||
) |
A Reference implementation of an inner product operator: C = A x B_transpose.
a | A matrix of dimensions a_height x b_width |
b | A matrix of dimensions b_height x b_width |
c | A matrix of dimensions a_height x b_height |
a_height | Number of rows in A |
b_width | Number of columns in B |
b_height | Number of rows in B |
a_pad | Additional alignment zero-padding on a. |
b_pad | Additional alignment zero-padding on b. |
c_pad | Additional alignment zero-padding on c. |
act_function | The activation function to apply on the result of the inner product. |
act_params | Parameters to the activation function. |
Definition at line 87 of file ref_inner_product_op.cpp.
void ref_less | ( | float * | input0, |
float * | input1, | ||
bool * | results, | ||
int | input_size | ||
) |
A Reference implementation of less-than.
Definition at line 13 of file ref_less_op.cpp.
void ref_less_equal | ( | float * | input0, |
float * | input1, | ||
bool * | results, | ||
int | input_size | ||
) |
A Reference implementation of less-than-or-equal-to.
Definition at line 27 of file ref_less_op.cpp.
void ref_max_pooling_nchw_itermax | ( | float * | input, |
float * | result, | ||
int | img_num, | ||
int | img_chans, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_pad, | ||
int | res_rows, | ||
int | res_cols, | ||
int | res_pad, | ||
int | pool_row_size, | ||
int | pool_col_size, | ||
int | pool_row_stride, | ||
int | pool_col_stride | ||
) |
A Reference implementation of MaxPoolingOp on NCHW data, using a loop-based maximum function.
Definition at line 151 of file ref_pooling_op.cpp.
void ref_max_pooling_nchw_treemax | ( | float * | input, |
float * | result, | ||
int | img_num, | ||
int | img_chans, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_pad, | ||
int | res_rows, | ||
int | res_cols, | ||
int | res_pad, | ||
int | pool_row_size, | ||
int | pool_col_size, | ||
int | pool_row_stride, | ||
int | pool_col_stride | ||
) |
A Reference implementation of MaxPoolingOp on NCHW data, using a tree-based maximum function.
Definition at line 15 of file ref_pooling_op.cpp.
void ref_max_pooling_nhwc_itermax | ( | float * | input, |
float * | result, | ||
int | img_num, | ||
int | img_chans, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_pad, | ||
int | res_rows, | ||
int | res_cols, | ||
int | res_pad, | ||
int | pool_row_size, | ||
int | pool_col_size, | ||
int | pool_row_stride, | ||
int | pool_col_stride | ||
) |
A Reference implementation of MaxPoolingOp on NHWC data, using a loop-based maximum function.
Definition at line 204 of file ref_pooling_op.cpp.
void ref_max_pooling_nhwc_treemax | ( | float * | input, |
float * | result, | ||
int | img_num, | ||
int | img_chans, | ||
int | img_rows, | ||
int | img_cols, | ||
int | img_pad, | ||
int | res_rows, | ||
int | res_cols, | ||
int | res_pad, | ||
int | pool_row_size, | ||
int | pool_col_size, | ||
int | pool_row_stride, | ||
int | pool_col_stride | ||
) |
A Reference implementation of MaxPoolingOp on NHWC data, using a tree-based maximum function.
Definition at line 83 of file ref_pooling_op.cpp.
void ref_softmax_nc | ( | float * | inputs, |
float * | results, | ||
int | input_num, | ||
int | input_size, | ||
int | input_pad | ||
) |
A Reference implementation of the softmax function.
The softmax function exponentiates each element and then normalizes each row to sum to 1. To improve numerical stability, we use the max trick: the maximum value of each input is first subtracted from all of that input's elements before they are exponentiated.
inputs | Matrix of size input_num x input_size, stored row-major. This contains both inputs and the outputs. |
results | Output array. |
input_num | Batch size. |
input_size | Number of activations per input. |
input_pad | Alignment padding. |
Definition at line 27 of file ref_softmax_op.cpp.
void smv_activation_fun_nc_vec_fxp | ( | float16 * | host_inputs, |
float16 * | host_results, | ||
float * | inputs, | ||
float * | results, | ||
int | inputs_size, | ||
activation_type | function, | ||
activation_param_t | params | ||
) |
Top level function entry for all unary SMV activation functions.
Definition at line 13 of file activation_functions_simd.c.
void smv_avgpooling_nhwc_vec_fxp | ( | float16 * | host_inputs, |
float16 * | host_results, | ||
float * | inputs, | ||
float * | results, | ||
int | inputs_dims[4], | ||
int | results_dims[4], | ||
int | inputs_pad, | ||
int | results_pad, | ||
int | pool_rows, | ||
int | pool_cols, | ||
int | row_stride, | ||
int | col_stride, | ||
int | ofmap_start, | ||
SamplingInfo * | sampling | ||
) |
An average-pooling operation on SMV with NHWC format. This is the vectorized implementation.
This requires a blocked channel data format (GNHWC), where G = channels/8, and the last dimension = chans = 8. The last dimension MUST be 8. This supports arbitrary pooling sizes and strides.
host_inputs | Host inputs buffer in NHWC. |
host_results | Host results buffer in NHWC. |
inputs | Local inputs buffer in NHWC. |
results | Local results buffer in NHWC. |
inputs_dims | Dimensions of the inputs. |
results_dims | Dimensions of the results. |
inputs_pad | Align padding size on the channel dimension of the inputs. |
results_pad | Align padding size on the channel dimension of the results. |
pool_rows | Row size of the pooling function. |
pool_cols | Column size of the pooling function. |
row_stride | Stride size on the row dimension. |
col_stride | Stride size on the col dimension. |
ofmap_start | If the results contain more channels than the inputs, start from this one. Otherwise this should always be zero. |
sampling | Simulation sampling settings. |
void smv_batch_norm_post_conv_nchw_vec_fxp | ( | float16 * | host_inputs, |
float16 * | host_weights, | ||
float16 * | host_results, | ||
float * | inputs, | ||
float * | weights, | ||
float * | results, | ||
int | inputs_dims[4], | ||
int | weights_chans, | ||
int | inputs_pad, | ||
int | weights_pad, | ||
int | weights_start, | ||
activation_type | act_function, | ||
activation_param_t | act_params | ||
) |
SMV implementation of batch normalization following a convolutional/pooling layer on NCHW data.
After conv/pooling, we only have a gamma/beta per output feature map, not per activation.
Definition at line 100 of file batch_norm.c.
void smv_batch_norm_post_conv_nhwc_vec_fxp | ( | float16 * | host_inputs, |
float16 * | host_weights, | ||
float16 * | host_results, | ||
float * | inputs, | ||
float * | weights, | ||
float * | results, | ||
int | inputs_dims[4], | ||
int | weights_chans, | ||
int | inputs_pad, | ||
int | weights_pad, | ||
int | weights_start, | ||
activation_type | act_function, | ||
activation_param_t | act_params, | ||
SamplingInfo * | sampling | ||
) |
SMV implementation of batch normalization following a convolutional/pooling layer on NHWC data.
After conv/pooling, we only have a gamma/beta per output feature map, not per activation.
Definition at line 196 of file batch_norm.c.
void smv_batch_norm_post_fc_nc_vec_fxp | ( | float16 * | host_inputs, |
float16 * | host_weights, | ||
float16 * | host_results, | ||
float * | inputs, | ||
float * | weights, | ||
float * | results, | ||
int | inputs_dims[2], | ||
int | weights_acts, | ||
int | inputs_pad, | ||
int | inputs_start, | ||
int | send_results, | ||
activation_type | act_function, | ||
activation_param_t | act_params | ||
) |
SMV implementation of batch normalization following a fully-connected layer.
In this case, we have one pair of gamma/beta weights per activation.
Definition at line 41 of file batch_norm.c.
void smv_conv3d_nhwc_vec_fxp | ( | float16 * | host_inputs, |
float16 * | host_weights, | ||
float16 * | host_results, | ||
float * | inputs, | ||
float * | weights, | ||
float * | results, | ||
int | inputs_dims[4], | ||
int | weights_dims[4], | ||
int | results_dims[4], | ||
int | inputs_align_pad, | ||
int | weights_pad, | ||
int | results_pad, | ||
int | inputs_halo_pad[4], | ||
int | row_stride, | ||
int | col_stride, | ||
int | ifmap_start, | ||
int | kern_start, | ||
bool | accumulate, | ||
bool | read_inputs, | ||
bool | read_weights, | ||
bool | send_results, | ||
activation_type | act_function, | ||
activation_param_t | act_params, | ||
SamplingInfo * | sampling | ||
) |
Perform a 3D convolution with one kernel on an image, with reduction in NHWC format. This is the vectorized implementation.
host_inputs | Host inputs buffer in NHWC. |
host_weights | Host weights buffer in NHWC. |
host_results | Host results buffer in NHWC. |
inputs | Local inputs buffer in NHWC. |
weights | Local weights buffer in NHWC. |
results | Local results buffer in NHWC. |
inputs_dims | Dimensions of the inputs. |
weights_dims | Dimensions of the weights. |
results_dims | Dimensions of the results. |
inputs_align_pad | Alignment padding size on the channel dimension of the inputs. |
weights_pad | Alignment padding size on the channel dimension of the weights. |
results_pad | Alignment padding size on the channel dimension of the results. |
inputs_halo_pad | Padding sizes on top, bottom, left and right of the input 2D feature maps. |
row_stride | Stride size on the row dimension. |
col_stride | Stride size on the col dimension. |
ifmap_start | If the input contains more channels than the weights, start from this one. Otherwise this should always be zero. |
kern_start | If the weights contain more kernels than the results buffer can fit, start from this one. Otherwise this should always be zero. |
accumulate | If the original weight tensor is tiled channelwise, this should be set to true in order to avoid resetting the result buffer for non-first weight tiles. |
read_inputs | Load inputs from the host. Set to false if the input activations can be reused from the last invocation. |
read_weights | Load weights from the host. Set to false if the weights can be reused from the last invocation. |
send_results | Send the results to the host memory if this is true. |
act_function | Activation function the operator runs. |
act_params | Parameters for the activation function. |
sampling | Simulation sampling settings. |
Definition at line 53 of file convolution_simd.c.
void smv_eltwise_add_nc_vec_fxp | ( | float16 * | host_inputs0, |
float16 * | host_inputs1, | ||
float16 * | host_results, | ||
float * | inputs0, | ||
float * | inputs1, | ||
float * | results, | ||
int | inputs_size | ||
) |
SMV implementation of elementwise addition.
Definition at line 13 of file eltwise_add.c.
void smv_eltwise_mul_nc_vec_fxp | ( | float16 * | host_inputs0, |
float16 * | host_inputs1, | ||
float16 * | host_results, | ||
float * | inputs0, | ||
float * | inputs1, | ||
float * | results, | ||
int | inputs_size | ||
) |
SMV implementation of elementwise multiplication.
Definition at line 13 of file eltwise_mul.c.
void smv_greater_equal_nc_vec_fxp | ( | float16 * | host_inputs0, |
float16 * | host_inputs1, | ||
bool * | host_results, | ||
float * | inputs0, | ||
float * | inputs1, | ||
bool * | results, | ||
int | inputs_size | ||
) |
void smv_greater_nc_vec_fxp | ( | float16 * | host_inputs0, |
float16 * | host_inputs1, | ||
bool * | host_results, | ||
float * | inputs0, | ||
float * | inputs1, | ||
bool * | results, | ||
int | inputs_size | ||
) |
void smv_less_equal_nc_vec_fxp | ( | float16 * | host_inputs0, |
float16 * | host_inputs1, | ||
bool * | host_results, | ||
float * | inputs0, | ||
float * | inputs1, | ||
bool * | results, | ||
int | inputs_size | ||
) |
void smv_less_nc_vec_fxp | ( | float16 * | host_inputs0, |
float16 * | host_inputs1, | ||
bool * | host_results, | ||
float * | inputs0, | ||
float * | inputs1, | ||
bool * | results, | ||
int | inputs_size | ||
) |
void smv_matrix_multiply_transpose_nc_vec_fxp | ( | float16 * | host_a, |
float16 * | host_b, | ||
float16 * | host_results, | ||
float * | a, | ||
float * | b, | ||
float * | results, | ||
int | a_dims[2], | ||
int | b_dims[2], | ||
int | results_dims[2], | ||
int | a_pad, | ||
int | b_pad, | ||
int | results_pad, | ||
int | a_start, | ||
int | result_start, | ||
bool | accumulate, | ||
bool | read_inputs, | ||
bool | send_results, | ||
activation_type | act_function, | ||
activation_param_t | act_params, | ||
SamplingInfo * | sampling | ||
) |
Matrix b after transposition:
cols (originally rows) —>
rows [[—][—][—]]
 |   [[—][—][—]]
     [[—][—][—]]
 v   [[—][—][—]]
Each [—] represents an 8-wide vector. This inner product executes a 32-way MACC – 4 such 8-wide vectors – per PE, and 8 PEs, where each PE is assigned a row in the transposed matrix. It continues across each row of b until the complete output pixel is finished (output stationary).
No biases are added.
Args:
host_a | Host buffer for a in NC. |
host_b | Host buffer for b in NC. |
host_results | Host results buffer in NC. |
a | Local buffer for a in NC. |
b | Local buffer for b in NC. |
results | Local results buffer in NC. |
a_dims | Dimensions of a. |
b_dims | Dimensions of b. |
results_dims | Dimensions of the results. |
a_pad | Align padding size on the channel dimension of a. |
b_pad | Align padding size on the channel dimension of b. |
results_pad | Align padding size on the channel dimension of the results. |
a_start | If a contains more activations than b, start from this one. Otherwise this should always be zero. |
result_start | If the results contain more neurons than b, start writing results from this one. Otherwise this should always be zero. |
accumulate | If the original b tensor is tiled on activations, this should be set to true in order to avoid resetting the result buffer for non-first b tiles. |
read_inputs | Load inputs from the host. Set to false if the input activations can be reused from the last invocation. |
send_results | Send the results to the host memory if this is true. |
act_function | Activation function the operator runs. |
act_params | Parameters for the activation function. |
sampling | Simulation sampling settings. |
Definition at line 59 of file matrix_multiply.c.
void smv_maxpooling_nhwc_vec_fxp | ( | float16 * | host_inputs, |
float16 * | host_results, | ||
float * | inputs, | ||
float * | results, | ||
int | inputs_dims[4], | ||
int | results_dims[4], | ||
int | inputs_pad, | ||
int | results_pad, | ||
int | pool_rows, | ||
int | pool_cols, | ||
int | row_stride, | ||
int | col_stride, | ||
int | ofmap_start, | ||
SamplingInfo * | sampling | ||
) |
A max-pooling operation on SMV with NHWC format. This is the vectorized implementation.
Args:
host_inputs | Host inputs buffer in NHWC. |
host_results | Host results buffer in NHWC. |
inputs | Local inputs buffer in NHWC. |
results | Local results buffer in NHWC. |
inputs_dims | Dimensions of the inputs. |
results_dims | Dimensions of the results. |
inputs_pad | Align padding size on the channel dimension of the inputs. |
results_pad | Align padding size on the channel dimension of the results. |
pool_rows | Row size of the pooling function. |
pool_cols | Column size of the pooling function. |
row_stride | Stride size on the row dimension. |
col_stride | Stride size on the col dimension. |
ofmap_start | If the results contain more channels than the inputs, start from this one. Otherwise this should always be zero. |
sampling | Simulation sampling settings. |
void smv_softmax_nc_vec_fxp | ( | float16 * | host_inputs, |
float16 * | host_results, | ||
float * | inputs, | ||
float * | results, | ||
int | input_num, | ||
int | input_size, | ||
int | input_pad | ||
) |
Top level function for softmax.
Definition at line 31 of file activation_functions_simd.c.