// SMAUG
// Simulating Machine Learning Applications on gem5-Aladdin
// smv_batch_norm_op.cpp
#include "smaug/core/backend.h"
#include "smaug/operators/common.h"
#include "smaug/operators/smv/smv_batch_norm_op.h"
#include "smaug/operators/smv/smv_batch_norm_tiling.h"
#include "smaug/operators/smv/smv_kernels.h"
#include "smaug/operators/smv/smv_accel_pool.h"
#include "smaug/utility/debug_stream.h"
8 
9 namespace smaug {
10 namespace smv {
11 namespace bn {
12 
13 const int kVectorSize = 8;
14 
15 } // namespace bn
16 } // namespace smv
17 
18 // The tile dispatcher for post-FC batch norms. The tile iteration is in the
19 // following order:
20 // 1) N: batch-wise tiles in the inputs.
21 // 2) A: activation-wise tiles in the inputs/weights.
23  TiledTensor& weights,
24  TiledTensor& outputs) {
25  int inputNumTiles = inputs.getShape()[0];
26  int inputActTiles = inputs.getShape()[1];
27  int weightActTiles = weights.getShape()[1];
28  auto inputIdx = inputs.startIndex();
29  auto weightIdx = weights.startIndex();
30  auto outputIdx = outputs.startIndex();
32  smv::kBatchNormHw, "host_inputs", getInputsMemType());
34  smv::kBatchNormHw, "host_weights", getWeightsMemType());
36  smv::kBatchNormHw, "host_results", getOutputsMemType());
37  for (int N = 0; N < inputNumTiles; N++) {
38  int iC = 0, wC = 0;
39  // This keeps track of the activation offset of the inputs.
40  int actOffset = 0;
41  while (iC < inputActTiles && wC < weightActTiles) {
42  int inputTileIdx = inputIdx(N, iC);
43  int weightTileIdx = weightIdx(0, wC);
44  int outputTileIdx = outputIdx(N, iC);
45  dout(1) << "Input: " << inputIdx(N, iC)
46  << ", weight: " << weightIdx(0, wC)
47  << ", output: " << outputIdx(N, iC) << "\n";
48  Tensor* inputTile = inputs.getTileWithData(inputTileIdx);
49  Tensor* weightsTile = weights.getTileWithData(weightTileIdx);
50  Tensor* outputTile = outputs[outputTileIdx];
51  const TensorShape& inputShape = inputTile->getShape();
52  const TensorShape& weightsShape = weightsTile->getShape();
53  const TensorShape& outputShape = outputTile->getShape();
54  mapArrayToAccel(smv::kBatchNormHw, "host_inputs",
55  inputTile->data<float16>(),
56  inputShape.storageSize() * sizeof(float16));
57  mapArrayToAccel(smv::kBatchNormHw, "host_weights",
58  weightsTile->data<float16>(),
59  weightsShape.storageSize() * sizeof(float16));
60  mapArrayToAccel(smv::kBatchNormHw, "host_results",
61  outputTile->data<float16>(),
62  outputShape.storageSize() * sizeof(float16));
63  int inputDims[2] = { inputShape[0], inputShape[1] };
64  // If the input and weight tiles belong to the same channel
65  // group, then their data will be loaded at the same time into
66  // the spads, so we start from the beginning of the tile.
67  // Otherwise, we start from the last place we left off from.
68  int actStart = (iC == wC) ? 0 : actOffset;
69  // Send the results back to host memory when we finish the weights.
70  bool sendOutputs = iC == wC || wC == weightActTiles - 1;
71 
73  inputTile->data<float16>(),
74  weightsTile->data<float16>(),
75  outputTile->data<float16>(), smv::spad0, smv::spad1,
76  smv::spad2, inputDims, weightsShape[1],
77  inputShape.getPadding(1), actStart, sendOutputs,
78  actInfo.function, actInfo.params);
79 
80  actOffset += weightsTile->getShape()[1];
81  if (inputActTiles == weightActTiles) {
82  iC++;
83  wC++;
84  } else if (inputActTiles == 1) {
85  wC++;
86  } else {
87  assert(false && "The input/weight tiles can have different "
88  "number of channels only when the inputs "
89  "don't need activation-wise tiling.");
90  }
91  }
92  }
93 }
94 
95 // The tile dispatcher for post-convolution batch norms. The tile iteration is
96 // in the following order:
97 // 1) N: batch-wise tiles in the inputs.
98 // 2) H: row-wise tiles in the inputs.
99 // 3) W: column-wise tiles in the inputs.
100 // 4) C: channel-wise tiles in the inputs.
102  TiledTensor& weights,
103  TiledTensor& outputs) {
104  // Ordinarily, we don't need to tile the weights.
105  assert(weights.size() == 1);
106  int inputNumTiles = inputs.getShape()[0];
107  int inputRowTiles = inputs.getShape()[1];
108  int inputColTiles = inputs.getShape()[2];
109  int inputChanTiles = inputs.getShape()[3];
110  auto inputIdx = inputs.startIndex();
111  auto outputIdx = outputs.startIndex();
112  Tensor* weightTile = weights.getTileWithData(0);
113  const TensorShape& weightShape = weightTile->getShape();
114  for (int i = 0; i < numAcceleratorsAvailable; i++) {
115  mapArrayToAccel(smv::kBatchNormHw + i, "host_weights",
116  weightTile->data<float16>(),
117  weightShape.storageSize() * sizeof(float16));
119  smv::kBatchNormHw + i, "host_inputs", getInputsMemType());
121  smv::kBatchNormHw + i, "host_weights", getWeightsMemType());
123  smv::kBatchNormHw + i, "host_results", getOutputsMemType());
124  }
126  int currAccelIdx = 0;
127  for (int N = 0; N < inputNumTiles; N++) {
128  for (int H = 0; H < inputRowTiles; H++) {
129  for (int W = 0; W < inputColTiles; W++) {
130  // This keeps track of the channel offset of the inputs.
131  int ifmapOffset = 0;
132  for (int C = 0; C < inputChanTiles; C++) {
133  int inputTileIdx = inputIdx(N, H, W, C);
134  int outputTileIdx = outputIdx(N, H, W, C);
135  dout(1) << "Input: " << inputTileIdx << ", Weight: 0"
136  << ", output: " << outputTileIdx << "\n";
137  Tensor* inputTile = inputs.getTileWithData(inputTileIdx);
138  Tensor* outputTile = outputs[outputTileIdx];
139  const TensorShape& inputShape = inputTile->getShape();
140  const TensorShape& outputShape = outputTile->getShape();
141  mapArrayToAccel(smv::kBatchNormHw + currAccelIdx,
142  "host_inputs", inputTile->data<float16>(),
143  inputShape.storageSize() * sizeof(float16));
145  smv::kBatchNormHw + currAccelIdx, "host_results",
146  outputTile->data<float16>(),
147  outputShape.storageSize() * sizeof(float16));
148  int inputDims[4] = { inputShape[0], inputShape[1],
149  inputShape[2], inputShape[3] };
150 
151  std::unique_ptr<volatile int> finishFlag =
153  currAccelIdx,
154  smv::kBatchNormHw + currAccelIdx,
156  inputTile->data<float16>(),
157  weightTile->data<float16>(),
158  outputTile->data<float16>(), smv::spad0,
159  smv::spad1, smv::spad2, inputDims,
160  weightShape[1], inputShape.getPadding(3),
161  weightShape.getPadding(1), ifmapOffset,
162  actInfo.function, actInfo.params,
163  &sampling);
164  accelPool.addFinishFlag(
165  currAccelIdx, std::move(finishFlag));
166  ifmapOffset += inputShape[3];
167  currAccelIdx =
168  accelPool.getNextAvailableAccelerator(currAccelIdx);
169  }
170  }
171  }
172  }
173  accelPool.joinAll();
174 }
175 
176 void SmvBatchNormOp::tile() {
177  // This function will tile (if necessary) the input/weight/output tensors
178  // of the batch norm operator into smaller tensor tiles so that each tile
179  // can fit in the corresponding scratchpad of the accelerator. It merges
180  // the four weights tensors into one and does tiling on it.
181  tiledTensors = smaug::smv::bn::TilingOptimizer::doTiling(this);
182 }
183 
184 void SmvBatchNormOp::run() {
185  using namespace smaug::smv::bn;
186  auto input = getInput(Inputs);
187  auto mean = getInput(Mean);
188  auto variance = getInput(Variance);
189  auto gamma = getInput(Gamma);
190  auto beta = getInput(Beta);
191  auto output = getOutput(Outputs);
192  const TensorShape& inputShape = input->getShape();
193  const TensorShape& kernelShape = mean->getShape();
194  const TensorShape& outputShape = output->getShape();
195  bool isPostConv = (input->ndims() == 4);
196  dout(2) << *mean << "\n";
197  dout(2) << *variance<< "\n";
198  dout(2) << *gamma << "\n";
199  dout(2) << *beta << "\n";
200 
201  {
202  auto stats = gem5::ScopedStats(
203  stats::kTensorPrepStart, stats::kTensorPrepEnd);
204  tiledTensors[0].copyDataToAllTiles();
205  tiledTensors[1].copyDataToAllTiles();
206  }
207 
208  if (isPostConv) {
209  assert(inputShape.getLayout() == DataLayout::NHWC);
210  assert(outputShape.getLayout() == DataLayout::NHWC);
211  runNHWC(tiledTensors[0], tiledTensors[1], tiledTensors[2]);
212  } else {
213  assert(inputShape.getLayout() == DataLayout::NC);
214  assert(outputShape.getLayout() == DataLayout::NC);
215  runNA(tiledTensors[0], tiledTensors[1], tiledTensors[2]);
216  }
217 
218  {
219  auto stats = gem5::ScopedStats(
220  stats::kTensorFinalStart, stats::kTensorFinalEnd);
221  tiledTensors[2].untile();
222  }
223 }
224 
225 } // namespace smaug
smv_batch_norm_post_conv_nhwc_vec_fxp
void smv_batch_norm_post_conv_nhwc_vec_fxp(float16 *host_inputs, float16 *host_weights, float16 *host_results, float *inputs, float *weights, float *results, int inputs_dims[4], int weights_chans, int inputs_pad, int weights_pad, int weights_start, activation_type act_function, activation_param_t act_params, SamplingInfo *sampling)
Definition: batch_norm.c:196
smaug::Tensor
Tensor represents a single multi-dimensional array of data.
Definition: tensor.h:344
smaug::Tensor::data
const T * data() const
Returns a const pointer to the Tensor data.
Definition: tensor.h:521
smaug::numAcceleratorsAvailable
int numAcceleratorsAvailable
The actual number of accelerator complexes currently in use.
Definition: globals.cpp:6
smaug::dout
const DebugStream & dout(int debugLevel)
Returns a DebugStream instance for the given debug level.
Definition: debug_stream.cpp:16
smaug::TiledTensor::getTileWithData
Tensor * getTileWithData(int index)
Returns a Tensor at the specified tile position, with data copied from the original tensor.
Definition: tensor.cpp:65
smaug::smv::bn
Contains batch-norm implementations and tiling optimizers for SMV.
Definition: smv_batch_norm_op.cpp:11
smaug::TiledTensor
A multidimensional container of Tensors.
Definition: tensor.h:552
smaug::invokeKernelNoBlock
std::unique_ptr< volatile int > invokeKernelNoBlock(int accelIdx, unsigned reqCode, const Kernel &kernel, Args &&... args)
A generic non-blocking interface to accelerated kernel functions.
Definition: common.h:106
smaug::setArrayMemTypeIfSimulating
void setArrayMemTypeIfSimulating(unsigned reqCode, const char *arrayName, MemoryType memType)
Sets what memory access mechanism the accelerator will use when accessing this array.
Definition: common.cpp:21
smaug::SmvBatchNormOp::runNHWC
void runNHWC(TiledTensor &inputs, TiledTensor &weights, TiledTensor &outputs)
Post-convolution tile dispatcher.
Definition: smv_batch_norm_op.cpp:101
smaug::TensorShape
TensorShape describes the shape of a Tensor.
Definition: tensor.h:35
smaug::SmvAcceleratorPool::addFinishFlag
void addFinishFlag(int accelIdx, std::unique_ptr< volatile int > finishFlag)
Add a finish flag for the specified accelerator.
Definition: smv_accel_pool.cpp:12
smaug::SmvAcceleratorPool::joinAll
void joinAll()
Wait until all the finish flags turn complete.
Definition: smv_accel_pool.cpp:32
smaug::smv::bn::TilingOptimizer::doTiling
static std::array< TiledTensor, 3 > doTiling(SmvBatchNormOp *op)
Runs the tiling optimizer on the given batch norm op.
Definition: smv_batch_norm_tiling.cpp:260
smv_batch_norm_post_fc_nc_vec_fxp
void smv_batch_norm_post_fc_nc_vec_fxp(float16 *host_inputs, float16 *host_weights, float16 *host_results, float *inputs, float *weights, float *results, int inputs_dims[2], int weights_acts, int inputs_pad, int inputs_start, int send_results, activation_type act_function, activation_param_t act_params)
Definition: batch_norm.c:41
smaug
The smaug namespace is the parent namespace of all C++ code in SMAUG.
Definition: backend.cpp:38
common.h
Utilities for writing and invoking Aladdin kernels from Operators.
smaug::SmvAcceleratorPool
Implements a pool of worker accelerators.
Definition: smv_accel_pool.h:32
smaug::SmvBatchNormOp::runNA
void runNA(TiledTensor &inputs, TiledTensor &weights, TiledTensor &outputs)
Post-FC tile dispatcher.
Definition: smv_batch_norm_op.cpp:22
smaug::mapArrayToAccel
void mapArrayToAccel(unsigned reqCode, const char *arrayName, void *baseAddr, size_t size)
Maps an array of data to the accelerator.
Definition: common.cpp:12
smaug::invokeKernel
void invokeKernel(int accelIdx, unsigned reqCode, const Kernel &kernel, Args &&... args)
The generic blocking interface for all accelerator kernel functions.
Definition: common.h:72
smaug::SmvAcceleratorPool::getNextAvailableAccelerator
int getNextAvailableAccelerator(int currAccelIdx)
Get the next accelerator and wait if it's still busy.
Definition: smv_accel_pool.cpp:39