SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
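This page lists the tiling optimizer for the SMV inner product (fully connected) operator. It chooses tiling strategies for the input, weight, and output tensors, enumerates every tile shape that fits in the scratchpad, and picks the configuration with the highest SRAM utilization.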
smv_inner_product_tiling.cpp
#include <algorithm>

#include "smaug/core/backend.h"
#include "smaug/operators/common.h"
#include "smaug/operators/smv/smv_inner_product_op.h"
#include "smaug/operators/smv/smv_inner_product_tiling.h"
#include "smaug/utility/debug_stream.h"

namespace smaug {
namespace smv {
namespace fc {
std::array<TilingDims, 3> TilingOptimizer::determineBestTilingDims(
        Tensor* inputs, Tensor* weights, Tensor* outputs, int maxTileSize) {
    // Determine the best tiling strategy for each of inputs, weights, and
    // outputs. Don't try to figure out the actual tile sizes yet.
    TilingDims bestInputTilingDims = findBestTilingDims(
            inputs->getShape(), maxTileSize, { 1, kNumMaccsPerPE });
    TilingDims bestWeightTilingDims = findBestTilingDims(
            weights->getShape(), maxTileSize, { kNumPEs, kNumMaccsPerPE });
    TilingDims bestOutputTilingDims = findBestTilingDims(
            outputs->getShape(), maxTileSize, { 1, kNumPEs });

    // Apply some constraints to simplify the tiling logic.
    //
    // If the weights require tiling on neurons, then the outputs must be
    // DimNC (if the outputs require tiling at all), so that we copy out C
    // neurons of outputs after every tile.
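    // For example (shapes invented for this comment): if the weights have
    // 1024 neuron rows but only kNumPEs of them fit per tile, each weight
    // tile yields just kNumPEs output neurons per batch, so the matching
    // output tile must be an N x kNumPEs slice (DimNC) rather than a full
    // row of outputs.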
    if (needsNwiseTiling(bestWeightTilingDims) && bestOutputTilingDims != None)
        bestOutputTilingDims = DimNC;

    return { bestInputTilingDims, bestWeightTilingDims, bestOutputTilingDims };
}

TilingConfig TilingOptimizer::computeBasicTileShapes(SmvInnerProductOp* op) {
    Tensor* inputs = op->getInput(op->Inputs);
    Tensor* weights = op->getInput(op->Weights);
    Tensor* outputs = op->getOutput(op->Outputs);
    int maxTileSize = SmvBackend::SpadSize() / inputs->getDataTypeSize();
    std::array<TilingDims, 3> strategies =
            determineBestTilingDims(inputs, weights, outputs, maxTileSize);
    TilingDims inputTilingDims = strategies[0];
    TilingDims weightTilingDims = strategies[1];
    TilingDims outputTilingDims = strategies[2];

    dout(1) << "  Tiling dimensions chosen:\n"
            << "    input: " << inputTilingDims
            << ", weight: " << weightTilingDims
            << ", output: " << outputTilingDims << "\n";

    TensorShape inputsShape = inputs->getShape();
    TensorShape weightsShape = weights->getShape();
    TensorShape outputsShape = outputs->getShape();
    // There are two degrees of freedom we can play with in total:
    // N (batch/neuron), C (activation).
    // Each tiling strategy may reduce this down to just one.
    // 1. Start with inputs. Enumerate all shapes that fit.
    // 2. Move on to weights. Enumerate all shapes that are compatible with
    //    the input shape and fit.
    // 3. Move on to outputs. If the weights don't need tiling, the outputs
    //    can be tiled independently; otherwise, based on the input and
    //    weights tile shapes, the output tile shape is completely
    //    determined.
    // For each tiling strategy, compute the total SRAM utilization. The
    // config with the highest utilization is the chosen one.
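    // For example (sizes invented for illustration): with inputs of shape
    // (16, 4096) tiled DimNC under a 32K-element budget, step 1 might
    // produce input tiles such as (16, 2048) or (16, 1024); step 2 pairs
    // each with weight tiles sharing the activation dim, e.g. (8, 2048);
    // step 3 then fixes the corresponding output tile at (16, 8).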
    std::vector<TensorShape> inputConfigs;
    if (inputTilingDims == DimN) {
        enum2DTensorTilingConfigs(inputsShape,
                                  maxTileSize,
                                  { 1, inputsShape[1] },
                                  { 1, 1 },
                                  inputConfigs);
    } else if (inputTilingDims == DimNC) {
        enum2DTensorTilingConfigs(inputsShape,
                                  maxTileSize,
                                  { 1, kNumMaccsPerPE },
                                  { 1, kNumMaccsPerPE },
                                  inputConfigs);
    } else {
        inputConfigs.push_back(inputsShape);
    }
    assert(!inputConfigs.empty() && "No tiling configurations found!");

    // Fill in weights.
    std::list<TilingConfig> inputWeightConfigs;
    for (auto it = inputConfigs.begin(); it != inputConfigs.end(); ++it) {
        const TensorShape& inputsShape = *it;
        if (weightTilingDims == DimN) {
            int minOfmaps = std::min(weightsShape[0], kNumPEs);
            for (int n = minOfmaps; n <= weightsShape[0]; n += kNumPEs) {
                TilingConfig config;
                config.weights = TensorShape({ n, inputsShape[1] },
                                             inputsShape.getLayout(),
                                             SmvBackend::Alignment);
                if (config.weights.storageSize() <= maxTileSize) {
                    config.inputs = inputsShape;
                    inputWeightConfigs.push_back(config);
                } else {
                    break;
                }
            }
        } else if (weightTilingDims == DimNC) {
            int minNeurons = std::min(weightsShape[0], kNumPEs);
            int minActs = std::min(weightsShape[1], kNumMaccsPerPE);
            for (int n = minNeurons; n <= weightsShape[0]; n += kNumPEs) {
                TilingConfig config;
                config.weights = weightsShape;
                config.weights[0] = n;
                if (needsCwiseTiling(inputTilingDims)) {
                    // If the inputs are also tiled activation-wise, then the
                    // weights have to take the same activations dimension.
                    config.weights[1] = inputsShape[1];
                    if (config.weights.storageSize() <= maxTileSize) {
                        config.inputs = inputsShape;
                        inputWeightConfigs.push_back(config);
                    } else {
                        break;
                    }
                } else {
                    // The weights can be independently tiled activation-wise
                    // only if the inputs are not tiled on activations.
                    for (int c = minActs; c <= weightsShape[1];
                         c += kNumMaccsPerPE) {
                        config.weights[1] = c;
                        if (config.weights.storageSize() <= maxTileSize) {
                            config.inputs = inputsShape;
                            inputWeightConfigs.push_back(config);
                        } else {
                            break;
                        }
                    }
                }
            }
        } else {
            TilingConfig config;
            config.inputs = inputsShape;
            config.weights = weightsShape;
            if (needsCwiseTiling(inputTilingDims)) {
                // This can happen with small weights. If the inputs are tiled
                // channelwise, then the weight tile needs to have the same
                // number of channels.
                config.weights[1] = inputsShape[1];
            }
            inputWeightConfigs.push_back(config);
        }
    }
    assert(!inputWeightConfigs.empty() && "No tiling configurations found!");

    // Fill in outputs.
    std::vector<TilingConfig> fullConfigs;
    for (auto it = inputWeightConfigs.begin(); it != inputWeightConfigs.end();
         ++it) {
        int minChannels = std::min(it->weights[0], kNumPEs);
        bool weightsNeedTiling = (weightTilingDims != None);
        bool outputsNeedTiling = (outputTilingDims != None);
        for (int c = minChannels; c <= weightsShape[0]; c += kNumPEs) {
            TilingConfig config = *it;
            config.outputs = outputsShape;
            config.outputs[0] = config.inputs[0];
            if (weightsNeedTiling && outputsNeedTiling) {
                config.outputs[1] = config.weights[0];
            } else if (outputsNeedTiling) {
                // This rarely happens, but handle it for completeness. If
                // the weights don't need tiling but the outputs do, the
                // channel size of the output tile can be chosen
                // independently.
                config.outputs[1] = c;
            }
            if (config.outputs.storageSize() <= maxTileSize) {
                fullConfigs.push_back(config);
            }
            // If the output shape is uniquely determined, we don't need to
            // explore any other output channel values.
            if (weightsNeedTiling || outputTilingDims == None)
                break;
        }
    }
    dout(2) << "  Number of possible tiling configs: " << fullConfigs.size()
            << "\n";
    for (auto& config : fullConfigs)
        dout(2) << "    " << config << "\n";
    auto maxIt = std::max_element(
            fullConfigs.begin(),
            fullConfigs.end(),
            [](const TilingConfig& c1, const TilingConfig& c2) {
                return c1.getTotalSize() < c2.getTotalSize();
            });
    assert(maxIt != fullConfigs.end() && "Failed to get best tiling config!");
    // Fill in the tiling dims.
    maxIt->inputTilingDims = inputTilingDims;
    maxIt->weightTilingDims = weightTilingDims;
    maxIt->outputTilingDims = outputTilingDims;
    return *maxIt;
}

std::array<TiledTensor, 3> TilingOptimizer::doTiling(SmvInnerProductOp* op) {
    auto input = op->getInput(SmvInnerProductOp::Inputs);
    auto kernels = op->getInput(SmvInnerProductOp::Weights);
    auto output = op->getOutput(SmvInnerProductOp::Outputs);
    TilingConfig tileConfig = TilingOptimizer::computeBasicTileShapes(op);
    // Don't copy data for the input and output tiles here (copyData = false).
    TiledTensor tiledInputs = generateTiledTensor(
            input, tileConfig.inputs, op, /* copyData */ false);
    // Copy data for the weight tiles since the data is read-only.
    TiledTensor tiledWeights =
            generateTiledTensor(kernels, tileConfig.weights, op);
    tiledWeights.copyDataToAllTiles();
    TiledTensor tiledOutputs = generateTiledTensor(
            output, tileConfig.outputs, op, /* copyData */ false);
    return { tiledInputs, tiledWeights, tiledOutputs };
}

}  // namespace fc
}  // namespace smv
}  // namespace smaug
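The selection logic above follows a simple pattern: enumerate every configuration whose tiles fit in the scratchpad, then take the one that maximizes total SRAM utilization. Below is a self-contained sketch of that pattern; this is not SMAUG code, and the Config struct and all sizes are invented for illustration.

#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>

// Standalone illustration of the enumerate-then-maximize pattern used by
// computeBasicTileShapes(). All sizes here are hypothetical.
struct Config {
    int inputSize;
    int weightSize;
    int outputSize;
    int total() const { return inputSize + weightSize + outputSize; }
};

int main() {
    const int maxTileSize = 32768;  // hypothetical scratchpad capacity
    std::vector<Config> configs;
    // Enumerate candidate splits of the activation dimension, keeping only
    // the configurations whose tiles all fit in the scratchpad.
    for (int c = 1024; c <= 4096; c += 1024) {
        Config config{ 16 * c, 8 * c, 16 * 8 };
        if (config.inputSize <= maxTileSize &&
            config.weightSize <= maxTileSize &&
            config.outputSize <= maxTileSize) {
            configs.push_back(config);
        }
    }
    assert(!configs.empty() && "No tiling configurations found!");
    // Pick the configuration with the highest total SRAM utilization,
    // mirroring the std::max_element call in computeBasicTileShapes().
    auto best = std::max_element(
            configs.begin(), configs.end(),
            [](const Config& a, const Config& b) {
                return a.total() < b.total();
            });
    std::cout << "Best total utilization: " << best->total() << "\n";
    return 0;
}

In the real optimizer, the enumeration spans both the batch/neuron and activation dimensions and is driven by enum2DTensorTilingConfigs, as shown in the listing above.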
smaug::Tensor
Tensor represents a single multi-dimensional array of data.
Definition: tensor.h:344
smaug::dout
const DebugStream & dout(int debugLevel)
Returns a DebugStream instance for the given debug level.
Definition: debug_stream.cpp:16
smaug::smv::TilingDims
TilingDims
The set of supported tiling strategies.
Definition: smv_tiling_common.h:13
smaug::smv::fc::TilingOptimizer::computeBasicTileShapes
static TilingConfig computeBasicTileShapes(SmvInnerProductOp *op)
Determine the best basic tiling shape for this fc layer without bias.
Definition: smv_inner_product_tiling.cpp:35
smaug::smv::TilingOptimizerBase::enum2DTensorTilingConfigs
static void enum2DTensorTilingConfigs(TensorShape shape, int maxTileSize, const std::vector< int > &minShape, const std::vector< int > &strides, std::vector< TensorShape > &configs)
Enumerates all tiling configs for a two dimensional Tensor.
Definition: smv_tiling_base.cpp:56
smaug::TiledTensor
A multidimensional container of Tensors.
Definition: tensor.h:552
smaug::smv::fc::TilingOptimizer::determineBestTilingDims
static std::array< TilingDims, 3 > determineBestTilingDims(Tensor *inputs, Tensor *weights, Tensor *outputs, int maxTileSize)
Determine the best tiling dimensions for running inner product on SMV.
Definition: smv_inner_product_tiling.cpp:13
smaug::TensorShape
TensorShape describes the shape of a Tensor.
Definition: tensor.h:35
smaug::smv::TilingOptimizerBase::findBestTilingDims
static TilingDims findBestTilingDims(const TensorShape &shape, int maxTileSize, const std::vector< int > &minShape)
Find the best set of dimensions to tile a given tensor shape.
Definition: smv_tiling_base.cpp:10
smaug::smv::TilingConfig
A TilingConfig describes tiling strategies and optimal tile sizes for inputs, weights,...
Definition: smv_tiling_common.h:29
smaug
The smaug namespace is the parent namespace of all C++ code in SMAUG.
Definition: backend.cpp:38
common.h
Utilities for writing and invoking Aladdin kernels from Operators.
smaug::SmvInnerProductOp
Inner product operator on SMV.
Definition: smv_inner_product_op.h:28
smaug::generateTiledTensor
TiledTensor generateTiledTensor(Tensor *tensor, const TensorShape &tileShape, Operator *op, bool copyData)
Generates a TiledTensor from a source Tensor.
Definition: tensor_utils.cpp:335
smaug::TiledTensor::copyDataToAllTiles
void copyDataToAllTiles()
Copies data (if needed) to all the tiles from the original Tensor.
Definition: tensor.cpp:116