SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
smv_inner_product_op.cpp
#include "smaug/core/backend.h"
#include "smaug/operators/common.h"
#include "smaug/operators/smv/smv_inner_product_op.h"
#include "smaug/operators/smv/smv_inner_product_tiling.h"
#include "smaug/operators/smv/smv_kernels.h"
#include "smaug/operators/smv/smv_accel_pool.h"
#include "smaug/utility/debug_stream.h"

namespace smaug {
namespace smv {
namespace fc {

const int kNumPEs = 8;
const int kNumMaccsPerPE = 32;

} // namespace fc
} // namespace smv

// This function iterates over the tiles generated by the tiling optimizer and
// sends a tile triplet to the hardware kernel for computation. The tile
// iteration is in the following order:
// 1) N: batch-wise tiles in the inputs.
// 2) W: neuron-wise tiles in the weights.
// 3) A: activation-wise tiles in the inputs/weights.
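//
// As an illustrative walk-through (hypothetical tile counts, not taken from
// any particular network): with inputs tiled 2 (N) x 3 (A) and weights tiled
// 4 (W) x 3 (A), the kernel is invoked 2 * 4 * 3 = 24 times, in the order
// (N0, W0, A0..A2), (N0, W1, A0..A2), ..., (N1, W3, A0..A2). Each invocation
// multiplies one input tile by one weight tile and accumulates into the
// output tile for batch tile N.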
void SmvInnerProductOp::runNWA(TiledTensor& inputs,
                               TiledTensor& weights,
                               TiledTensor& outputs) {
    // Ordinarily, we don't need to tile the outputs. If this assertion fails,
    // it means the inner product has uncommonly large outputs; we can add
    // output iteration when that happens.
    assert(outputs.size() == 1 &&
           "Inner product outputs tiling not implemented yet!");
    int inputNumTiles = inputs.getShape()[0];
    int inputActTiles = inputs.getShape()[1];
    int weightActTiles = weights.getShape()[1];
    int weightNeuronTiles = weights.getShape()[0];
    auto inputIdx = inputs.startIndex();
    auto weightIdx = weights.startIndex();
    auto outputIdx = outputs.startIndex();
    for (int i = 0; i < numAcceleratorsAvailable; i++) {
        setArrayMemTypeIfSimulating(
                smv::kInnerProductHw + i, "host_a", getInputsMemType());
        setArrayMemTypeIfSimulating(
                smv::kInnerProductHw + i, "host_b", getWeightsMemType());
        setArrayMemTypeIfSimulating(
                smv::kInnerProductHw + i, "host_results", getOutputsMemType());
    }
    SmvAcceleratorPool accelPool(numAcceleratorsAvailable);
    std::vector<int> lastReadInputTileIdx(numAcceleratorsAvailable, -1);
    int currAccelIdx = 0;
    for (int N = 0; N < inputNumTiles; N++) {
        // Usually we are constrained by the weights, whereas the outputs can
        // fit in the scratchpad. This tracks the number of finished neurons,
        // which the kernel uses to compute the correct offset into the
        // outputs scratchpad.
        int finishedNeurons = 0;
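        // For example (hypothetical sizes): if each weight tile holds 64
        // neurons, then after the W = 0 iteration finishedNeurons == 64, so
        // the W = 1 tile's results start at offset 64 in the outputs
        // scratchpad.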
        for (int W = 0; W < weightNeuronTiles; W++) {
            // Up to this point, the loop nests have no data dependencies
            // among themselves, so we can run them in parallel. The loop
            // nests beyond this level must run serially, because the
            // input/weight channelwise tile iterations accumulate results
            // into the same output tile.
            int outputTileIdx = outputIdx(N, 0);
            Tensor* outputTile = outputs[outputTileIdx];
            const TensorShape& outputShape = outputTile->getShape();
            mapArrayToAccel(smv::kInnerProductHw + currAccelIdx, "host_results",
                            outputTile->data<float16>(),
                            outputShape.storageSize() * sizeof(float16));
            int iC = 0, wC = 0;
            // This keeps track of the activation offset of the inputs.
            int actOffset = 0;
            while (iC < inputActTiles && wC < weightActTiles) {
                int inputTileIdx = inputIdx(N, iC);
                int weightTileIdx = weightIdx(W, wC);
                // There is one condition under which the input tile has a
                // different number of activations from the weight tile: the
                // inputs don't need tiling on activations while the weights
                // do. In that case, we send the input tile once and keep it
                // stationary in the scratchpad, finishing the weight
                // activation-wise tiles with multiple invocations.
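                // For example (hypothetical counts): with inputActTiles == 1
                // and weightActTiles == 3, the single input tile is loaded
                // once and reused across three kernel invocations, one per
                // weight activation-wise tile (wC = 0, 1, 2).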
                dout(1) << "Input: " << inputTileIdx
                        << ", weights: " << weightTileIdx
                        << ", output: " << outputTileIdx << "\n";
                Tensor* inputTile = inputs.getTileWithData(inputTileIdx);
                Tensor* weightsTile = weights.getTileWithData(weightTileIdx);
                const TensorShape& inputShape = inputTile->getShape();
                const TensorShape& weightsShape = weightsTile->getShape();
                mapArrayToAccel(smv::kInnerProductHw + currAccelIdx, "host_a",
                                inputTile->data<float16>(),
                                inputShape.storageSize() * sizeof(float16));
                mapArrayToAccel(smv::kInnerProductHw + currAccelIdx, "host_b",
                                weightsTile->data<float16>(),
                                weightsShape.storageSize() * sizeof(float16));
                int inputDims[2] = { inputShape[0], inputShape[1] };
                int weightsDims[2] = { weightsShape[0], weightsShape[1] };
                int outputDims[2] = { outputShape[0], outputShape[1] };
                // If the input and weight tiles belong to the same channel
                // group, their data will be loaded into the spads at the same
                // time, so we start from the beginning of the tile.
                // Otherwise, we start from where we last left off.
                int actStart = (iC == wC) ? 0 : actOffset;
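                // For example (hypothetical sizes): with a stationary input
                // tile and weight tiles of 256 activations each, actStart is
                // 0 for wC = 0 but 256 for wC = 1, because actOffset has
                // advanced past the first weight tile's activations.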
                // If the weights are tiled on activations, this should be set
                // to true for non-first weight tiles to avoid resetting the
                // result buffer.
                bool accumulate = wC > 0;
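                // For example: with weightActTiles == 3, the wC = 0 invocation
                // overwrites the output scratchpad, while the wC = 1 and
                // wC = 2 invocations accumulate partial products into it.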
                // If this is a new input tile, then we need to read it.
                bool readInputs = false;
                if (inputTileIdx != lastReadInputTileIdx[currAccelIdx]) {
                    readInputs = true;
                    lastReadInputTileIdx[currAccelIdx] = inputTileIdx;
                }
                // We only need to send the results back to host memory in the
                // very last invocation.
                bool sendOutputs = (N == inputNumTiles - 1) &&
                                   (W == weightNeuronTiles - 1) &&
                                   (wC == weightActTiles - 1);

                std::unique_ptr<volatile int> finishFlag = invokeKernelNoBlock(
                        currAccelIdx, smv::kInnerProductHw + currAccelIdx,
                        smv_matrix_multiply_transpose_nc_vec_fxp,
                        inputTile->data<float16>(),
                        weightsTile->data<float16>(),
                        outputTile->data<float16>(), smv::spad0, smv::spad1,
                        smv::spad2, inputDims, weightsDims, outputDims,
                        inputShape.getPadding(1), weightsShape.getPadding(1),
                        outputShape.getPadding(1), actStart, finishedNeurons,
                        accumulate, readInputs, sendOutputs, actInfo.function,
                        actInfo.params, &sampling);
                accelPool.addFinishFlag(currAccelIdx, std::move(finishFlag));

                actOffset += weightsTile->getShape()[1];
                if (inputActTiles == weightActTiles) {
                    iC++;
                    wC++;
                } else if (inputActTiles == 1) {
                    wC++;
                } else {
                    assert(false && "The input/weight tiles can have different "
                                    "number of channels only when the inputs "
                                    "don't need activation-wise tiling.");
                }
            }
            finishedNeurons += weights[weightIdx(W, 0)]->getShape()[0];
            currAccelIdx = accelPool.getNextAvailableAccelerator(currAccelIdx);
        }
    }
    // Before we leave, make sure all the accelerators have finished.
    accelPool.joinAll();
}

void SmvInnerProductOp::tile() {
    // This function will tile (if necessary) the input/weight/output tensors
    // of the inner product operator into smaller tensor tiles so that each
    // tile can fit in the corresponding scratchpad of the accelerator.
    tiledTensors = smaug::smv::fc::TilingOptimizer::doTiling(this);
}
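
// Note: run() below passes tiledTensors[0], tiledTensors[1], and
// tiledTensors[2] to runNWA() as the input, weight, and output tiles,
// respectively.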

void SmvInnerProductOp::run() {
    auto inputs = getInput(Inputs);
    auto weights = getInput(Weights);
    auto outputs = getOutput(Outputs);
    const TensorShape& inputsShape = inputs->getShape();
    const TensorShape& weightsShape = weights->getShape();
    const TensorShape& outputsShape = outputs->getShape();
    assert(inputsShape.getLayout() == DataLayout::NC);
    assert(weightsShape.getLayout() == DataLayout::NC);
    assert(outputsShape.getLayout() == DataLayout::NC);
    dout(2) << *weights << "\n";

    {
        auto stats = gem5::ScopedStats(
                stats::kTensorPrepStart, stats::kTensorPrepEnd);
        tiledTensors[0].copyDataToAllTiles();
        tiledTensors[1].copyDataToAllTiles();
    }

    runNWA(tiledTensors[0], tiledTensors[1], tiledTensors[2]);

    {
        auto stats = gem5::ScopedStats(
                stats::kTensorFinalStart, stats::kTensorFinalEnd);
        tiledTensors[2].untile();
    }
}

} // namespace smaug
Referenced symbols:

smaug::numAcceleratorsAvailable: int numAcceleratorsAvailable
    The actual number of accelerator complexes currently in use.
    (Definition: globals.cpp:6)

smaug::dout: const DebugStream& dout(int debugLevel)
    Returns a DebugStream instance for the given debug level.
    (Definition: debug_stream.cpp:16)

smaug::TiledTensor::getTileWithData: Tensor* getTileWithData(int index)
    Returns a Tensor at the specified tile position, with data copied from
    the original tensor.
    (Definition: tensor.cpp:65)

smaug::TiledTensor
    A multidimensional container of Tensors.
    (Definition: tensor.h:552)

smaug::invokeKernelNoBlock: std::unique_ptr<volatile int>
    invokeKernelNoBlock(int accelIdx, unsigned reqCode, const Kernel& kernel,
    Args&&... args)
    A generic non-blocking interface to accelerated kernel functions.
    (Definition: common.h:106)

smaug::setArrayMemTypeIfSimulating: void setArrayMemTypeIfSimulating(
    unsigned reqCode, const char* arrayName, MemoryType memType)
    Sets what memory access mechanism the accelerator will use when accessing
    this array.
    (Definition: common.cpp:21)

smv_matrix_multiply_transpose_nc_vec_fxp: void
    smv_matrix_multiply_transpose_nc_vec_fxp(float16* host_a, float16* host_b,
    float16* host_results, float* a, float* b, float* results, int a_dims[2],
    int b_dims[2], int results_dims[2], int a_pad, int b_pad, int results_pad,
    int a_start, int result_start, bool accumulate, bool read_inputs,
    bool send_results, activation_type act_function,
    activation_param_t act_params, SamplingInfo* sampling)
    (Definition: matrix_multiply.c:59)

smaug
    The smaug namespace is the parent namespace of all C++ code in SMAUG.
    (Definition: backend.cpp:38)

common.h
    Utilities for writing and invoking Aladdin kernels from Operators.

smaug::mapArrayToAccel: void mapArrayToAccel(unsigned reqCode,
    const char* arrayName, void* baseAddr, size_t size)
    Maps an array of data to the accelerator.
    (Definition: common.cpp:12)