SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
smv_convolution_op.cpp
1 #include "smaug/core/backend.h"
2 #include "smaug/operators/common.h"
3 #include "smaug/operators/smv/smv_convolution_op.h"
4 #include "smaug/operators/smv/smv_convolution_tiling.h"
5 #include "smaug/operators/smv/smv_kernels.h"
6 #include "smaug/operators/smv/smv_accel_pool.h"
7 #include "smaug/utility/debug_stream.h"
8 
9 namespace smaug {
10 namespace smv {
11 namespace conv {
12 
13 const int kNumPEs = 8;
14 const int kNumMaccsPerPE = 32;
15 
16 } // namespace conv
17 } // namespace smv
18 
19 void SmvConvolutionOp::runNHWC(TiledTensor& inputs,
20                                TiledTensor& weights,
21                                TiledTensor& outputs) {
22     int inputIfmapTiles = inputs.getShape()[0];
23     int inputRowTiles = inputs.getShape()[1];
24     int inputChanTiles = inputs.getShape()[3];
25     int weightOfmapTiles = weights.getShape()[0];
26     int weightChanTiles = weights.getShape()[3];
27     int outputRowTiles = outputs.getShape()[1];
28     int outputChanTiles = outputs.getShape()[3];
29     auto inputIdx = inputs.startIndex();
30     auto weightIdx = weights.startIndex();
31     auto outputIdx = outputs.startIndex();
32     std::vector<int> inputPadding = getInputPadding();
33     int topPad = inputPadding[0];
34     int bottomPad = inputPadding[1];
35     int leftPad = inputPadding[2];
36     int rightPad = inputPadding[3];
37     unsigned accelId = useSystolicArrayWhenAvailable ? smv::kSystolicArrayHw
38                                                      : smv::kConvolutionHw;
39     SmvAcceleratorPool accelPool(numAcceleratorsAvailable);
40     std::vector<int> lastReadInputTileIdx(numAcceleratorsAvailable, -1);
41     std::vector<int> lastReadWeightTileIdx(numAcceleratorsAvailable, -1);
42     for (int i = 0; i < numAcceleratorsAvailable; i++) {
43         setArrayMemTypeIfSimulating(
44                 accelId + i, "host_inputs", getInputsMemType());
45         setArrayMemTypeIfSimulating(
46                 accelId + i, "host_weights", getWeightsMemType());
47         setArrayMemTypeIfSimulating(
48                 accelId + i, "host_results", getOutputsMemType());
49     }
50     int currAccelIdx = 0;
51     for (int N = 0; N < inputIfmapTiles; N++) {
52         for (int H = 0; H < outputRowTiles; H++) {
53             int currentTileTopPad = topPad;
54             int currentTileBottomPad = bottomPad;
55             if (inputRowTiles > 1) {
56                 if (H == 0) {
57                     currentTileBottomPad = 0;
58                 } else if (H == inputRowTiles - 1) {
59                     currentTileTopPad = 0;
60                 } else {
61                     currentTileTopPad = 0;
62                     currentTileBottomPad = 0;
63                 }
64             }
65             // This is used to specify the padding sizes on the boundaries of
66             // the 2D feature maps in an input tile.
67             int inputHaloPad[4] = { currentTileTopPad, currentTileBottomPad,
68                                     leftPad, rightPad };
69             // In one case, the tiling optimizer allows the weight tile to
70             // contain more kernels than the output tile: the weights do not
71             // need N-wise tiling (weightOfmapTiles = 1), whereas the output
72             // needs channelwise tiling (weightOfmapTiles < outputChanTiles).
73             // We will then need multiple kernel invocations to finish the
74             // weight tile, where each invocation only consumes part of it. The
75             // argument 'kern_start' is used for this: it provides the starting
76             // kernel from which the weight tile will be effective.
77             bool needOutputIteration = weightOfmapTiles < outputChanTiles;
78             int kernStart = 0;
79             // This is the number of invocations we need to finish the weight
80             // tile. In common scenarios, only one invocation is needed. If we
81             // need to iterate over the output channels, outputChanTiles
82             // invocations are needed to finish the weight tile.
83             int numOutputInvocations =
84                     needOutputIteration ? outputChanTiles : 1;
85             assert(numOutputInvocations > 1
86                            ? weightOfmapTiles == 1
87                            : weightOfmapTiles == outputChanTiles);
88             for (int W = 0; W < weightOfmapTiles; W++) {
89                 // We have three loop levels up to this point: the first for
90                 // input batchwise tile iteration, the second for input
91                 // rowwise tile iteration, and the third for weight N-wise
92                 // tile iteration. There are no data dependencies among the
93                 // loop nests at these levels, so we can run them in
94                 // parallel.
95                 //
96                 // We have another two loop levels beyond this point, one for
97                 // output channelwise tile iteration and the other for weight
98                 // channelwise tile iteration. We run these loop nests in
99                 // serial (i.e., on a single accelerator). The ones in the
100                // latter loop accumulate results into the same output tile
101                // and thus exhibit a data dependency, whereas the former
102                // could technically run in parallel, but that would require
103                // reloading too many weights, so we choose not to.
104                for (int oC = 0; oC < numOutputInvocations; oC++) {
105                    int iC = 0, wC = 0;
106                    // This keeps track of the channel offset of the input.
107                    int ifmapOffset = 0;
108                    int outputTileIdx = outputIdx(N, H, 0, W + oC);
109                    Tensor* outputTile = outputs[outputTileIdx];
110                    const TensorShape& outputShape = outputTile->getShape();
111                    mapArrayToAccel(
112                            accelId + currAccelIdx, "host_results",
113                            outputTile->data<float16>(),
114                            outputShape.storageSize() * sizeof(float16));
115 
116                    // The tiling optimizer will make sure that the weight tiles
117                    // have the same channel dimension as the input tiles (so
118                    // that inputChanTiles = weightChanTiles), except in one case
119                    // where the input is not tiled channelwise (inputChanTiles
120                    // = 1) and the weights are independently tiled channelwise.
121                    // In that case, we will need multiple kernel invocations to
122                    // finish the weight channelwise tiles, with the same input
123                    // channel tile, producing results for the same output
124                    // channels.
125                    while (iC < inputChanTiles && wC < weightChanTiles) {
126                        int inputTileIdx = inputIdx(N, H, 0, iC);
127                        int weightTileIdx = weightIdx(W, 0, 0, wC);
128                        dout(1) << "Input: " << inputTileIdx
129                                << ", weights: " << weightTileIdx
130                                << ", output: " << outputTileIdx << "\n";
131                        Tensor* inputTile =
132                                inputs.getTileWithData(inputTileIdx);
133                        Tensor* weightsTile =
134                                weights.getTileWithData(weightTileIdx);
135                        const TensorShape& inputShape = inputTile->getShape();
136                        const TensorShape& weightsShape =
137                                weightsTile->getShape();
138                        mapArrayToAccel(
139                                accelId + currAccelIdx, "host_inputs",
140                                inputTile->data<float16>(),
141                                inputShape.storageSize() * sizeof(float16));
142                        mapArrayToAccel(
143                                accelId + currAccelIdx, "host_weights",
144                                weightsTile->data<float16>(),
145                                weightsShape.storageSize() * sizeof(float16));
146                        int inputDims[4] = { inputShape[0], inputShape[1],
147                                             inputShape[2], inputShape[3] };
148                        int weightsDims[4] = { weightsShape[0], weightsShape[1],
149                                               weightsShape[2],
150                                               weightsShape[3] };
151                        int outputDims[4] = { outputShape[0], outputShape[1],
152                                              outputShape[2], outputShape[3] };
153                        // The 'ifmap_start' argument of the kernel is for
154                        // handling when inputChanTiles < weightChanTiles. It
155                        // provides the starting channel of the input tile that
156                        // will be effective for computation in the invocation.
157                        int ifmapStart = (iC == wC) ? 0 : ifmapOffset;
158                        // Since multiple weight channelwise tiles produce the
159                        // same output channels, 'accumulate' is set to true to
160                        // avoid resetting the result for non-first (wC > 0)
161                        // weight channelwise tiles.
162                        bool accumulate = wC > 0;
163                        // If this is a new input/weight tile, then we need to
164                        // read it.
165                        bool readInputs = false;
166                        if (inputTileIdx !=
167                            lastReadInputTileIdx[currAccelIdx]) {
168                            readInputs = true;
169                            lastReadInputTileIdx[currAccelIdx] = inputTileIdx;
170                        }
171                        bool readWeights = false;
172                        if (weightTileIdx !=
173                            lastReadWeightTileIdx[currAccelIdx]) {
174                            readWeights = true;
175                            lastReadWeightTileIdx[currAccelIdx] = weightTileIdx;
176                        }
177                        // If we reach the last invocation for the weight
178                        // channelwise tiles, the results are finished and need
179                        // to be sent back to the host.
180                        bool sendResults = wC == weightChanTiles - 1;
181 
182                        std::unique_ptr<volatile int> finishFlag;
183                        if (useSystolicArrayWhenAvailable) {
184                            // Invoke the systolic array if specified.
185                            finishFlag = invokeSystolicArrayKernel(
186                                    accelId + currAccelIdx,
187                                    inputTile->data<float16>(),
188                                    weightsTile->data<float16>(),
189                                    outputTile->data<float16>(), inputDims,
190                                    weightsDims, outputDims,
191                                    inputShape.getPadding(3),
192                                    weightsShape.getPadding(3),
193                                    outputShape.getPadding(3), inputHaloPad,
194                                    getRowStride(), ifmapStart, kernStart,
195                                    accumulate, readInputs, readWeights,
196                                    sendResults, &actInfo);
197                        } else {
198                            // Otherwise invoke the DLA-like kernel.
199                            finishFlag = invokeKernelNoBlock(
200                                    currAccelIdx, accelId + currAccelIdx,
201                                    smv_conv3d_nhwc_vec_fxp,
202                                    inputTile->data<float16>(),
203                                    weightsTile->data<float16>(),
204                                    outputTile->data<float16>(), smv::spad0,
205                                    smv::spad1, smv::spad2, inputDims,
206                                    weightsDims, outputDims,
207                                    inputShape.getPadding(3),
208                                    weightsShape.getPadding(3),
209                                    outputShape.getPadding(3), inputHaloPad,
210                                    getRowStride(), getColStride(), ifmapStart,
211                                    kernStart, accumulate, readInputs,
212                                    readWeights, sendResults, actInfo.function,
213                                    actInfo.params, &sampling);
214                        }
215                        accelPool.addFinishFlag(
216                                currAccelIdx, std::move(finishFlag));
217 
218                        ifmapOffset += weightsTile->getShape()[3];
219                        if (inputChanTiles == weightChanTiles) {
220                            iC++;
221                            wC++;
222                        } else if (inputChanTiles == 1) {
223                            wC++;
224                        } else {
225                            assert(false &&
226                                   "The input/weight tiles can have different "
227                                   "numbers of channels only when the inputs "
228                                   "don't need channelwise tiling.");
229                        }
230                    }
231                    if (needOutputIteration)
232                        kernStart += outputShape[3];
233                }
234                currAccelIdx =
235                        accelPool.getNextAvailableAccelerator(currAccelIdx);
236            }
237        }
238    }
239    // Before we leave, make sure all the accelerators have finished.
240    accelPool.joinAll();
241 }
242 
243 std::unique_ptr<volatile int> SmvConvolutionOp::invokeSystolicArrayKernel(
244         unsigned accelId,
245         float16* inputs,
246         float16* weights,
247         float16* outputs,
248         int inputsDims[4],
249         int weightsDims[4],
250         int outputsDims[4],
251         int inputsPad,
252         int weightsPad,
253         int outputPad,
254         int inputHaloPad[4],
255         int stride,
256         int ifmapStart,
257         int kernStart,
258         bool accumulate,
259         bool readInputs,
260         bool readWeights,
261         bool sendResults,
262         ActivationInfo* actInfo) {
263     // Note that if we are in trace mode, we should skip this gem5 accelerator.
264 #ifndef TRACE_MODE
265     assert(runningInSimulation && "The systolic array must be invoked in "
266                                   "simulation.");
267     systolic_array_params_t params;
268     params.input_base_addr = inputs;
269     params.weight_base_addr = weights;
270     params.output_base_addr = outputs;
271     memcpy(params.input_dims, inputsDims, sizeof(int) * 4);
272     memcpy(params.weight_dims, weightsDims, sizeof(int) * 4);
273     memcpy(params.output_dims, outputsDims, sizeof(int) * 4);
274     params.input_dims[3] += inputsPad;
275     params.weight_dims[3] += weightsPad;
276     params.output_dims[3] += outputPad;
277     params.stride = stride;
278     memcpy(params.input_halo_pad, inputHaloPad, sizeof(int) * 4);
279     params.ifmap_start = ifmapStart;
280     params.kern_start = kernStart;
281     params.accum_results = accumulate;
282     params.read_inputs = readInputs;
283     params.read_weights = readWeights;
284     params.send_results = sendResults;
285     // The systolic array kernel in gem5 uses the same
286     // activation type/params structures.
287     memcpy(&params.act_type, &(actInfo->function), sizeof(activation_type));
288     memcpy(&params.act_params, &(actInfo->params), sizeof(activation_param_t));
289     return std::unique_ptr<volatile int>(
290             invokeSystolicArrayAndReturn(accelId, params));
291 #else
292     return nullptr;
293 #endif
294 }
295 
296 void SmvConvolutionOp::tile() {
297     // This function will tile (if necessary) the input/weight/output tensors
298     // of the convolution operator into smaller tensor tiles so that each tile
299     // can fit in the corresponding scratchpad of the accelerator.
300     // TODO: Many networks have back-to-back convolutional layers, so it would
301     // be much more efficient not to retile between them. That could be
302     // achieved by sending the output tiles directly to the next convolutional
303     // layer instead of merging them into a single output tensor first. It is
304     // a form of operator fusion in which two back-to-back convolution
305     // operators are tiled only once.
306     tiledTensors = smaug::smv::conv::TilingOptimizer::doTiling(this);
307 }
308 
309 void SmvConvolutionOp::run() {
310     auto input = getInput(Inputs);
311     auto kernels = getInput(Kernels);
312     auto output = getOutput(Outputs);
313     const TensorShape& inputShape = input->getShape();
314     const TensorShape& kernelShape = kernels->getShape();
315     const TensorShape& outputShape = output->getShape();
316     assert(inputShape.getLayout() == DataLayout::NHWC);
317     assert(kernelShape.getLayout() == DataLayout::NHWC);
318     assert(outputShape.getLayout() == DataLayout::NHWC);
319     dout(2) << *kernels << "\n";
320 
321     {
322         auto stats = gem5::ScopedStats(
323                 stats::kTensorPrepStart, stats::kTensorPrepEnd);
324         tiledTensors[0].copyDataToAllTiles();
325         tiledTensors[1].copyDataToAllTiles();
326     }
327 
328     runNHWC(tiledTensors[0], tiledTensors[1], tiledTensors[2]);
329 
330     {
331         auto stats = gem5::ScopedStats(
332                 stats::kTensorFinalStart, stats::kTensorFinalEnd);
333         tiledTensors[2].untile();
334     }
335 }
336 
337 }  // namespace smaug
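
A note on the halo padding logic at lines 53-68: when the input rows are tiled, the operator-level top and bottom padding belong only to the first and last row tiles; every interior tile boundary is a halo region shared with a neighboring tile, not zero padding. The rule distills into the following standalone sketch, where the helper name rowTileHaloPad and the std::array return type are illustrative and not part of SMAUG:

#include <array>

// Compute the halo padding { top, bottom, left, right } for the H-th of
// inputRowTiles row tiles, mirroring lines 53-68 of the listing.
std::array<int, 4> rowTileHaloPad(int H, int inputRowTiles, int topPad,
                                  int bottomPad, int leftPad, int rightPad) {
    if (inputRowTiles > 1) {
        // Only the first row tile keeps the top padding...
        if (H != 0)
            topPad = 0;
        // ...and only the last row tile keeps the bottom padding.
        if (H != inputRowTiles - 1)
            bottomPad = 0;
    }
    // Columns are never tiled here, so left/right padding always applies.
    return { topPad, bottomPad, leftPad, rightPad };
}

For three row tiles this yields { topPad, 0, leftPad, rightPad }, { 0, 0, leftPad, rightPad }, and { 0, bottomPad, leftPad, rightPad } for H = 0, 1, and 2, respectively.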
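
As a worked example of the kern_start mechanism described at lines 69-87: suppose the weights need no N-wise tiling (weightOfmapTiles = 1, a single tile holding 64 kernels) but the output is tiled channelwise into outputChanTiles = 2 tiles of 32 channels each. Then needOutputIteration is true and numOutputInvocations = 2, so the oC loop consumes the same weight tile twice: first with kernStart = 0, then, after kernStart += outputShape[3] at line 232, with kernStart = 32.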
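
The index bookkeeping at the bottom of the while loop (lines 218-229) admits exactly two tiling shapes: input and weight channel tiles that match one-to-one, or a single untiled input tile consumed by several weight channel tiles. Here is a minimal sketch of the advancement rule, with a hypothetical helper name:

#include <cassert>

// Advance the input (iC) and weight (wC) channel-tile indices after one
// kernel invocation, mirroring lines 218-229 of the listing.
void advanceChannelTiles(int inputChanTiles, int weightChanTiles,
                         int& iC, int& wC) {
    if (inputChanTiles == weightChanTiles) {
        // Matched channelwise tiling: the indices move in lockstep.
        iC++;
        wC++;
    } else if (inputChanTiles == 1) {
        // Untiled input: the same input tile is reused for every weight
        // channel tile, with ifmap_start selecting the effective channels.
        wC++;
    } else {
        assert(false && "Mismatched channel tiling is not supported.");
    }
}

In both shapes, every invocation with wC > 0 accumulates into the same output tile, and only the final one (wC == weightChanTiles - 1) sends results back to the host.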
smaug::Tensor
Tensor represents a single multi-dimensional array of data.
Definition: tensor.h:344
smaug::Tensor::data
const T * data() const
Returns a const pointer to the Tensor data.
Definition: tensor.h:521
smaug::numAcceleratorsAvailable
int numAcceleratorsAvailable
The actual number of accelerator complexes currently in use.
Definition: globals.cpp:6
activation_type
enum _activation_type activation_type
The activation function to apply to an operator's output in hardware.
smaug::dout
const DebugStream & dout(int debugLevel)
Returns a DebugStream instance for the given debug level.
Definition: debug_stream.cpp:16
smaug::TiledTensor::getTileWithData
Tensor * getTileWithData(int index)
Returns a Tensor at the specified tile position, with data copied from the original tensor.
Definition: tensor.cpp:65
ActivationInfo
Specifies an activation function and relevant parameters.
Definition: common.h:210
smaug::useSystolicArrayWhenAvailable
bool useSystolicArrayWhenAvailable
If true, uses the systolic array for applicable operators when backend support exists.
Definition: globals.cpp:8
smaug::SmvConvolutionOp::runNHWC
void runNHWC(TiledTensor &inputs, TiledTensor &weights, TiledTensor &outputs)
Tiling scheduler for this operator.
Definition: smv_convolution_op.cpp:19
smaug::TiledTensor
A multidimensional container of Tensors.
Definition: tensor.h:552
smaug::invokeKernelNoBlock
std::unique_ptr< volatile int > invokeKernelNoBlock(int accelIdx, unsigned reqCode, const Kernel &kernel, Args &&... args)
A generic non-blocking interface to accelerated kernel functions.
Definition: common.h:106
smaug::setArrayMemTypeIfSimulating
void setArrayMemTypeIfSimulating(unsigned reqCode, const char *arrayName, MemoryType memType)
Sets what memory access mechanism the accelerator will use when accessing this array.
Definition: common.cpp:21
_activation_param_t
Parameters to the activation function hardware.
Definition: common.h:194
smaug::runningInSimulation
bool runningInSimulation
This is true if the user chooses to run the network in gem5 simulation.
Definition: globals.cpp:4
smaug::TensorShape
TensorShape describes the shape of a Tensor.
Definition: tensor.h:35
smaug::SmvAcceleratorPool::addFinishFlag
void addFinishFlag(int accelIdx, std::unique_ptr< volatile int > finishFlag)
Add a finish flag for the specified accelerator.
Definition: smv_accel_pool.cpp:12
smv_conv3d_nhwc_vec_fxp
void smv_conv3d_nhwc_vec_fxp(float16 *host_inputs, float16 *host_weights, float16 *host_results, float *inputs, float *weights, float *results, int inputs_dims[4], int weights_dims[4], int results_dims[4], int inputs_align_pad, int weights_pad, int results_pad, int inputs_halo_pad[4], int row_stride, int col_stride, int ifmap_start, int kern_start, bool accumulate, bool read_inputs, bool read_weights, bool send_results, activation_type act_function, activation_param_t act_params, SamplingInfo *sampling)
Definition: convolution_simd.c:53
smaug::SmvAcceleratorPool::joinAll
void joinAll()
Wait until all the finish flags turn complete.
Definition: smv_accel_pool.cpp:32
smaug::ConvolutionOp< SmvBackend >::getInputPadding
std::vector< int > getInputPadding() const
Compute padding sizes on the row/column boundaries of the input feature map.
Definition: convolution_op.h:143
smaug
The smaug namespace is the parent namespace of all C++ code in SMAUG.
Definition: backend.cpp:38
common.h
Utilities for writing and invoking Aladdin kernels from Operators.
smaug::SmvAcceleratorPool
Implements a pool of worker accelerators.
Definition: smv_accel_pool.h:32
smaug::mapArrayToAccel
void mapArrayToAccel(unsigned reqCode, const char *arrayName, void *baseAddr, size_t size)
Maps an array of data to the accelerator.
Definition: common.cpp:12
smaug::SmvAcceleratorPool::getNextAvailableAccelerator
int getNextAvailableAccelerator(int currAccelIdx)
Get the next accelerator and wait if it's still busy.
Definition: smv_accel_pool.cpp:39