SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
smv_convolution_tiling.cpp
1 #include <algorithm>
2 
3 #include "smaug/core/backend.h"
4 #include "smaug/core/tensor_utils.h"
5 #include "smaug/operators/smv/smv_convolution_op.h"
6 #include "smaug/operators/smv/smv_convolution_tiling.h"
7 #include "smaug/utility/debug_stream.h"
8 
9 namespace smaug {
10 namespace smv {
11 namespace conv {
12 
13 std::array<TilingDims, 3> TilingOptimizer::determineBestTilingDims(
14  Tensor* inputs, Tensor* weights, Tensor* outputs, int maxTileSize) {
15  // Determine the best tiling strategy for each of inputs, weights, and
16  // outputs. Don't try to figure out the actual tile sizes yet.
17  TilingDims bestInputTilingDims =
18  findBestTilingDims(inputs->getShape(),
19  maxTileSize,
20  { 1, weights->getShape()[1],
21  inputs->getShape()[2], kNumMaccsPerPE });
22  TilingDims bestWeightTilingDims =
23  findBestTilingDims(weights->getShape(),
24  maxTileSize,
25  { kNumPEs, weights->getShape()[1],
26  weights->getShape()[2], kNumMaccsPerPE });
27  assert(bestWeightTilingDims != TilingDims::DimNH &&
28  "Weights cannot be tiled by dimensions NH!");
29  TilingDims bestOutputTilingDims =
30  findBestTilingDims(outputs->getShape(),
31  maxTileSize,
32  { 1, 1, outputs->getShape()[2], kNumPEs });
33 
34  // Apply some constraints to simplify tiling logic.
35  //
36  // If weights = DimN or DimNC, then outputs must be DimNC, so that we
37  // copy out C channels of the output after every tile. In theory, we
38  // could keep more of the output pixels on the scratchpad and copy them
39  // out only when it is actually full, but that's harder to manage (what
40  // if it fills up in the middle of the next tile?).
41  if (needsNwiseTiling(bestWeightTilingDims))
42  bestOutputTilingDims = DimNC;
43 
44  // If inputs require rowwise tiling, then outputs also require rowwise
45  // tiling. Strictly speaking this is not necessary, but it greatly
46  // simplifies memory management (see above).
47  if (needsHwiseTiling(bestInputTilingDims)) {
48  if (needsCwiseTiling(bestOutputTilingDims))
49  bestOutputTilingDims = DimNCH;
50  else
51  bestOutputTilingDims = DimNH;
52  }
53 
54  return { bestInputTilingDims, bestWeightTilingDims, bestOutputTilingDims };
55 }
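The constraint rules above can be exercised in isolation. Below is an editor's sketch, not part of the SMAUG source: it models the two rules with a simplified TilingDims enum and locally defined predicates (SMAUG's real definitions live in smv_tiling_common.h and smv_tiling_base.cpp).

#include <cassert>

enum TilingDims { None, DimN, DimNC, DimNH, DimNCH };

static bool needsNwiseTiling(TilingDims d) { return d != None; }
static bool needsCwiseTiling(TilingDims d) { return d == DimNC || d == DimNCH; }
static bool needsHwiseTiling(TilingDims d) { return d == DimNH || d == DimNCH; }

// Mirrors the two constraints applied in determineBestTilingDims.
static TilingDims constrainOutputDims(TilingDims weightDims,
                                      TilingDims inputDims,
                                      TilingDims outputDims) {
    // N-wise weight tiling forces channelwise output tiling, so each tile's
    // C output channels can be copied out as soon as the tile finishes.
    if (needsNwiseTiling(weightDims))
        outputDims = DimNC;
    // Rowwise input tiling forces rowwise output tiling as well.
    if (needsHwiseTiling(inputDims))
        outputDims = needsCwiseTiling(outputDims) ? DimNCH : DimNH;
    return outputDims;
}

int main() {
    // Rowwise-tiled inputs plus N-wise tiled weights yield DimNCH outputs.
    assert(constrainOutputDims(DimN, DimNH, DimN) == DimNCH);
    // Untiled weights with rowwise inputs yield DimNH outputs.
    assert(constrainOutputDims(None, DimNH, None) == DimNH);
    return 0;
}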
56 
57 TilingConfig TilingOptimizer::computeBasicTileShapes(SmvConvolutionOp* op) {
58  Tensor* inputs = op->getInput(op->Inputs);
59  Tensor* weights = op->getInput(op->Kernels);
60  Tensor* outputs = op->getOutput(op->Outputs);
61  int maxTileSize = SmvBackend::SpadSize() / inputs->getDataTypeSize();
62  std::array<TilingDims, 3> strategies =
63  determineBestTilingDims(inputs, weights, outputs, maxTileSize);
64  TilingDims inputTilingDims = strategies[0];
65  TilingDims weightTilingDims = strategies[1];
66  TilingDims outputTilingDims = strategies[2];
67 
68  dout(2) << " Tiling dimensions chosen:\n"
69  << " input: " << inputTilingDims
70  << ", weight: " << weightTilingDims
71  << ", output: " << outputTilingDims << "\n";
72 
73  TensorShape inputsShape = inputs->getShape();
74  TensorShape weightsShape = weights->getShape();
75  TensorShape outputsShape = outputs->getShape();
76 
77  // There are four degrees of freedom we can play with in total:
78  // N (batch), H (rows), C (channels), and P (ofmap).
79  // Each tiling strategy may reduce this down to just three.
80  // 1. Start with inputs. Enumerate all shapes that fit.
81  // 2. Move on to weights. Enumerate all shapes that are compatible with
82  // the input shape and fit.
83  // 3. Move on to outputs. If the weights don't need tiling, the outputs
84  // can be tiled independently; otherwise, based on the input and weights
85  // tile shapes, the output tile shape is completely determined.
86  // For each tiling strategy, compute the total SRAM utilization. The one
87  // with the highest utilization is chosen.
88  std::vector<TensorShape> inputConfigs;
89  if (inputTilingDims == DimN) {
90  std::vector<int> minShape = inputsShape.dims();
91  minShape[0] = 1;
92  enum4DTensorTilingConfigs(inputsShape,
93  maxTileSize,
94  minShape,
95  { 1, 1, 1, 1 },
96  inputConfigs);
97  } else if (inputTilingDims == DimNC) {
98  std::vector<int> minShape = inputsShape.dims();
99  minShape[0] = 1;
100  minShape[3] = kNumMaccsPerPE;
101  enum4DTensorTilingConfigs(inputsShape,
102  maxTileSize,
103  minShape,
104  { 1, 1, 1, kNumMaccsPerPE },
105  inputConfigs);
106  } else if (inputTilingDims == DimNH) {
107  std::vector<int> minShape = inputsShape.dims();
108  minShape[0] = 1;
109  minShape[1] = weightsShape[1];
110  enum4DTensorTilingConfigs(inputsShape,
111  maxTileSize,
112  minShape,
113  { 1, op->getRowStride(), 1, 1 },
114  inputConfigs);
115  } else if (inputTilingDims == DimNCH) {
116  std::vector<int> minShape = { 1, weightsShape[1], inputsShape[2],
117  kNumMaccsPerPE };
118  std::vector<int> strides = { 1, op->getRowStride(), 1, kNumMaccsPerPE };
119  enum4DTensorTilingConfigs(
120  inputsShape, maxTileSize, minShape, strides, inputConfigs);
121  } else {
122  inputConfigs.push_back(inputsShape);
123  }
124  assert(!inputConfigs.empty() && "No tiling configurations found!");
125 
126  // Fill in weights.
127  std::list<TilingConfig> inputWeightConfigs;
128  for (auto it = inputConfigs.begin(); it != inputConfigs.end(); ++it) {
129  TensorShape& inputsShape = *it;
130  if (weightTilingDims == DimN) {
131  int minOfmaps = std::min(weightsShape[0], kNumPEs);
132  for (int n = minOfmaps; n <= weightsShape[0]; n += kNumPEs) {
133  TilingConfig config;
134  config.weights = weightsShape;
135  config.weights[0] = n;
136  config.weights[3] = inputsShape[3];
137  if (config.weights.storageSize() <= maxTileSize) {
138  config.inputs = inputsShape;
139  inputWeightConfigs.push_back(config);
140  } else {
141  break;
142  }
143  }
144  } else if (weightTilingDims == DimNC) {
145  int minOfmaps = std::min(weightsShape[0], kNumPEs);
146  int minChannels = std::min(weightsShape[3], kNumMaccsPerPE);
147  for (int n = minOfmaps; n <= weightsShape[0]; n += kNumPEs) {
148  TilingConfig config;
149  config.weights = weightsShape;
150  config.weights[0] = n;
151  if (needsCwiseTiling(inputTilingDims)) {
152  // If the inputs are also tiled channelwise, then the
153  // weights have to take the same channel dimension.
154  config.weights[3] = inputsShape[3];
155  if (config.weights.storageSize() <= maxTileSize) {
156  config.inputs = inputsShape;
157  inputWeightConfigs.push_back(config);
158  } else {
159  break;
160  }
161  } else {
162  // The weights can be independently tiled channelwise only
163  // if the inputs are not channelwise tiled.
164  for (int c = minChannels; c <= weightsShape[3];
165  c += kNumMaccsPerPE) {
166  config.weights[3] = c;
167  if (config.weights.storageSize() <= maxTileSize) {
168  config.inputs = inputsShape;
169  inputWeightConfigs.push_back(config);
170  } else {
171  break;
172  }
173  }
174  }
175  }
176  } else if (weightTilingDims == DimNH || weightTilingDims == DimNCH) {
177  assert(false && "Weights can't be tiled rowwise!");
178  } else {
179  TilingConfig config;
180  config.inputs = inputsShape;
181  config.weights = weightsShape;
182  if (needsCwiseTiling(inputTilingDims)) {
183  // This can happen with small weights. If the inputs are tiled
184  // channelwise, then the weight tile needs to have the same
185  // number of channels.
186  config.weights[3] = inputsShape[3];
187  }
188  inputWeightConfigs.push_back(config);
189  }
190  }
191  assert(!inputWeightConfigs.empty() && "No tiling configurations found!");
192 
193  // Fill in outputs.
194  std::vector<TilingConfig> fullConfigs;
195  for (auto it = inputWeightConfigs.begin(); it != inputWeightConfigs.end();
196  ++it) {
197  int minChannels = std::min(it->weights[0], kNumPEs);
198  bool weightsNeedTiling = (weightTilingDims != None);
199  for (int c = minChannels; c <= weightsShape[0]; c += kNumPEs) {
200  TilingConfig config = *it;
201  config.outputs = outputsShape;
202  config.outputs[0] = config.inputs[0];
203  if (needsHwiseTiling(outputTilingDims)) {
204  int padding = op->getPadding() == SamePadding
205  ? FRAC_CEIL(config.weights[1] - 1, 2)
206  : 0;
207  config.outputs[1] = op->computeOutputDim(config.inputs[1],
208  config.weights[1],
209  op->getRowStride(),
210  padding);
211  config.outputs[3] = config.weights[0];
212  } else {
213  config.outputs[1] = outputsShape[1];
214  if (weightsNeedTiling)
215  config.outputs[3] = config.weights[0];
216  // If the weights don't need tiling and the outputs need tiling,
217  // the channel dimension of the output tile can be determined
218  // independently.
219  else if (outputTilingDims != None)
220  config.outputs[3] = c;
221  }
222  if (config.outputs.storageSize() <= maxTileSize) {
223  fullConfigs.push_back(config);
224  }
225  // This means the output shape is uniquely determined, so we don't
226  // need to explore any other output channel values.
227  if (weightsNeedTiling || outputTilingDims == None)
228  break;
229  }
230  }
231  dout(2) << " Number of possible tiling configs: " << fullConfigs.size()
232  << "\n";
233  for (auto& config : fullConfigs)
234  dout(2) << " " << config << "\n";
235  auto maxIt = std::max_element(
236  fullConfigs.begin(),
237  fullConfigs.end(),
238  [](const TilingConfig& c1, const TilingConfig& c2) {
239  return c1.getTotalSize() < c2.getTotalSize();
240  });
241  assert(maxIt != fullConfigs.end() && "Failed to get best tiling config!");
242  // Fill in the tiling dims.
243  (*maxIt).inputTilingDims = inputTilingDims;
244  (*maxIt).weightTilingDims = weightTilingDims;
245  (*maxIt).outputTilingDims = outputTilingDims;
246  return *maxIt;
247 }
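The selection at the end of computeBasicTileShapes, pruning configs that overflow a scratchpad and taking the one with the highest SRAM utilization, can be sketched standalone. Config and pickBestConfig below are hypothetical stand-ins, assuming each of the three tensors must fit in its own scratchpad of maxTileSize elements:

#include <algorithm>
#include <cassert>
#include <vector>

struct Config {
    int inputSize, weightSize, outputSize;
    int totalSize() const { return inputSize + weightSize + outputSize; }
};

static Config pickBestConfig(const std::vector<Config>& candidates,
                             int maxTileSize) {
    // Keep only configs where every tensor fits in its own scratchpad.
    std::vector<Config> fits;
    for (const Config& c : candidates)
        if (c.inputSize <= maxTileSize && c.weightSize <= maxTileSize &&
            c.outputSize <= maxTileSize)
            fits.push_back(c);
    assert(!fits.empty() && "No tiling configurations found!");
    // Highest combined footprint == highest SRAM utilization.
    return *std::max_element(fits.begin(), fits.end(),
                             [](const Config& a, const Config& b) {
                                 return a.totalSize() < b.totalSize();
                             });
}

int main() {
    std::vector<Config> candidates = { { 100, 50, 80 }, { 120, 60, 90 },
                                       { 200, 70, 95 } };
    // The third config overflows a 128-element scratchpad; of the rest, the
    // second uses the most SRAM in total.
    Config best = pickBestConfig(candidates, 128);
    assert(best.totalSize() == 120 + 60 + 90);
    return 0;
}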
248 
249 TiledTensor TilingOptimizer::generateRowwiseOutputTiledTensor(
250  SmvConvolutionOp* op,
251  const TiledTensor& inputTiledTensor,
252  const TiledTensor& weightsTiledTensor,
253  const TensorShape& maxOutputTileSize,
254  Tensor* outputTensor,
255  bool copyData) {
256  const TensorShape& inputShape = inputTiledTensor.getShape();
257  const TensorShape& weightsShape = weightsTiledTensor.getShape();
258  const TensorShape& outputShape = outputTensor->getShape();
259  int weightRows = op->getWeightRows();
260  int weightCols = op->getWeightCols();
261  bool samePadding = op->getPadding() == SamePadding;
262  // For even-sized filters, FRAC_CEIL is needed to correctly handle padding.
263  std::vector<int> inputPadding = op->getInputPadding();
264  int topRowPad = inputPadding[0];
265  int bottomRowPad = inputPadding[1];
266  int leftColPad = inputPadding[2];
267  int rightColPad = inputPadding[3];
268  std::vector<int> numBlocksInDim{ inputShape[0], inputShape[1],
269  inputShape[2], weightsShape[0] };
270  // With stride > 1, the last rowwise tile may not have enough rows to
271  // perform a convolution. If so, we decrease the row dimension of the
272  // output tiled tensor by 1.
273  int lastTileRows =
274  inputTiledTensor[inputTiledTensor.size() - 1]->getShape()[1];
275  if (lastTileRows + bottomRowPad < weightRows)
276  numBlocksInDim[1]--;
277  TiledTensor outputTiledTensor(
278  TensorShape(numBlocksInDim, inputShape.getLayout()), outputTensor);
279  const int ndims = outputShape.ndims();
280  std::vector<int> currentOrigin(ndims, 0);
281  auto inputIndex = inputTiledTensor.startIndex();
282  auto weightIndex = weightsTiledTensor.startIndex();
283  auto outputIndex = outputTiledTensor.startIndex();
284  for (int n = 0; n < numBlocksInDim[0]; n++) {
285  for (int h = 0; h < numBlocksInDim[1]; h++) {
286  for (int w = 0; w < numBlocksInDim[2]; w++) {
287  for (int c = 0; c < numBlocksInDim[3]; c++) {
288  const Tensor* inputTile =
289  inputTiledTensor[inputIndex(n, h, w, 0)];
290  const Tensor* weightsTile =
291  weightsTiledTensor[weightIndex(c, 0, 0, 0)];
292  const TensorShape& inputTileShape = inputTile->getShape();
293 
294  // DimNH tiling only affects rows, not columns.
295  int effInputRows = inputTileShape[1];
296  if (h == 0)
297  effInputRows += topRowPad;
298  else if (h == numBlocksInDim[1] - 1)
299  effInputRows += bottomRowPad;
300  int effInputCols =
301  inputTileShape[2] + leftColPad + rightColPad;
302  int outputRows = op->computeOutputDim(effInputRows,
303  weightRows,
304  op->getRowStride(),
305  ValidPadding);
306  int outputCols = op->computeOutputDim(effInputCols,
307  weightCols,
308  op->getColStride(),
309  ValidPadding);
310  TensorShape outputTileShape(
311  { inputTileShape[0], outputRows, outputCols,
312  weightsTile->getShape()[0] },
313  outputTensor->getShape().getLayout(),
314  SmvBackend::Alignment);
315  assert(outputTileShape.storageSize() <=
316  maxOutputTileSize.storageSize() &&
317  "DimNH input tiling results in output tile sizes "
318  "larger than the max tile size!");
319  int oi = outputIndex(n, h, w, c);
320  std::string tileName = op->getName() + ":" +
321  outputTensor->getName() +
322  "/tile:" + std::to_string((int)oi);
323  Tensor* outputTile = new Tensor(tileName, outputTileShape);
324  outputTile->allocateStorage(outputTensor->getDataType());
325  outputTiledTensor.setTile(
326  oi, currentOrigin, outputTile, copyData);
327  for (int i = ndims - 1; i >= 0; i--) {
328  currentOrigin[i] += outputTileShape[i];
329  if (currentOrigin[i] >= outputShape[i])
330  currentOrigin[i] = 0;
331  else
332  break;
333  }
334  }
335  }
336  }
337  }
338  op->getWorkspace()->addTiledTensor(outputTiledTensor);
339  dout(1) << " Tiled Tensor " << outputTensor->getName() << "(rowwise):\n"
340  << " original tensor shape: " << outputTensor->getShape() << "\n"
341  << " number of tiles: " << outputTiledTensor.size() << "\n";
342  return outputTiledTensor;
343 }
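The per-tile output sizing above follows the usual convolution output formula. A minimal sketch, assuming op->computeOutputDim(..., ValidPadding) reduces to (inputDim - weightDim) / stride + 1; computeOutputDimValid is a hypothetical stand-in:

#include <cassert>

static int computeOutputDimValid(int inputDim, int weightDim, int stride) {
    return (inputDim - weightDim) / stride + 1;
}

int main() {
    // A middle DimNH input tile of 16 rows (boundary padding only applies to
    // the first and last tiles), a 3-row filter, stride 1 -> 14 output rows.
    assert(computeOutputDimValid(16, 3, 1) == 14);
    // A trailing tile of 4 rows with no bottom padding cannot cover a 5-row
    // filter; this is the case where numBlocksInDim[1] is decremented so no
    // output tile is created for it.
    assert(4 + 0 < 5);
    return 0;
}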
344 
345 std::array<TiledTensor, 3> TilingOptimizer::doTiling(SmvConvolutionOp* op) {
346  auto input = op->getInput(SmvConvolutionOp::Inputs);
347  auto kernels = op->getInput(SmvConvolutionOp::Kernels);
348  auto output = op->getOutput(SmvConvolutionOp::Outputs);
349  TilingConfig tileConfig = TilingOptimizer::computeBasicTileShapes(op);
350  TiledTensor tiledInputs =
351  generateTiledTensorWithStrideAndPadding(input,
352  tileConfig.inputs,
353  op,
354  op->getWeightRows(),
355  op->getWeightCols(),
356  op->getRowStride(),
357  op->getColStride(),
358  op->getPadding());
359  // Copy data for the weight tiles since the data is read-only.
360  TiledTensor tiledWeights =
361  generateTiledTensor(kernels, tileConfig.weights,
362  op, /* copyData */ true);
363  TiledTensor tiledOutputs;
364  if (needsHwiseTiling(tileConfig.outputTilingDims)) {
365  tiledOutputs = generateRowwiseOutputTiledTensor(
366  op,
367  tiledInputs,
368  tiledWeights,
369  tileConfig.outputs,
370  output,
371  false);
372  } else {
373  tiledOutputs =
374  generateTiledTensor(output, tileConfig.outputs, op);
375  }
376  return { tiledInputs, tiledWeights, tiledOutputs };
377 }
378 
379 } // namespace conv
380 } // namespace smv
381 } // namespace smaug
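The origin-advance loop that registers output tiles in generateRowwiseOutputTiledTensor behaves like an odometer over the tensor dimensions: the innermost dimension advances by the tile extent and wraps to zero with a carry outward. An isolated sketch (advanceOrigin is an illustrative name, not a SMAUG function):

#include <cassert>
#include <vector>

static void advanceOrigin(std::vector<int>& origin,
                          const std::vector<int>& tileShape,
                          const std::vector<int>& fullShape) {
    for (int i = static_cast<int>(origin.size()) - 1; i >= 0; i--) {
        origin[i] += tileShape[i];
        if (origin[i] >= fullShape[i])
            origin[i] = 0;  // Wrap, then carry into dimension i - 1.
        else
            break;
    }
}

int main() {
    // 2D example: 2x3 tiles over a 4x6 tensor.
    std::vector<int> origin = { 0, 0 };
    std::vector<int> tile = { 2, 3 };
    std::vector<int> full = { 4, 6 };
    advanceOrigin(origin, tile, full);  // advance columns -> { 0, 3 }
    assert(origin[0] == 0 && origin[1] == 3);
    advanceOrigin(origin, tile, full);  // wrap columns, carry -> { 2, 0 }
    assert(origin[0] == 2 && origin[1] == 0);
    return 0;
}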
smaug::Tensor
Tensor represents a single multi-dimensional array of data.
Definition: tensor.h:344
smaug::smv::TilingOptimizerBase::enum4DTensorTilingConfigs
static void enum4DTensorTilingConfigs(TensorShape shape, int maxTileSize, const std::vector< int > &minShape, const std::vector< int > &strides, std::vector< TensorShape > &configs)
Enumerates all tiling configs for a four dimensional Tensor.
Definition: smv_tiling_base.cpp:78
smaug::dout
const DebugStream & dout(int debugLevel)
Returns a DebugStream instance for the given debug level.
Definition: debug_stream.cpp:16
smaug::smv::conv::TilingOptimizer::computeBasicTileShapes
static TilingConfig computeBasicTileShapes(SmvConvolutionOp *op)
Determine the best basic tiling shape for this convolution layer.
Definition: smv_convolution_tiling.cpp:57
smaug::smv::TilingDims
TilingDims
The set of supported tiling strategies.
Definition: smv_tiling_common.h:13
smaug::Tensor::allocateStorage
T * allocateStorage()
Allocates memory to store Tensor data.
Definition: tensor.h:473
smaug::TiledTensor
A multidimensional container of Tensors.
Definition: tensor.h:552
FRAC_CEIL
#define FRAC_CEIL(A, B)
Implements the ceiling function of A/B.
Definition: common.h:505
smaug::TiledTensor::setTile
void setTile(int index, const std::vector< int > &origin, Tensor *tensor, bool copyData)
Set the specified tile to the provided Tensor, and optionally copy data into it.
Definition: tensor.cpp:71
smaug::TensorShape
TensorShape describes the shape of a Tensor.
Definition: tensor.h:35
smaug::generateTiledTensorWithStrideAndPadding
TiledTensor generateTiledTensorWithStrideAndPadding(Tensor *tensor, const TensorShape &tileShape, Operator *op, int fieldRows, int fieldCols, int rowStride, int colStride, PaddingType paddingType, bool copyData)
Generates a TiledTensor from a source Tensor with the specified tile shape.
Definition: tensor_utils.cpp:233
smaug::smv::TilingOptimizerBase::findBestTilingDims
static TilingDims findBestTilingDims(const TensorShape &shape, int maxTileSize, const std::vector< int > &minShape)
Find the best set of dimensions to tile a given tensor shape.
Definition: smv_tiling_base.cpp:10
smaug::smv::conv::TilingOptimizer::generateRowwiseOutputTiledTensor
static TiledTensor generateRowwiseOutputTiledTensor(SmvConvolutionOp *op, const TiledTensor &inputTiledTensor, const TiledTensor &weightsTiledTensor, const TensorShape &maxOutputTileSize, Tensor *outputTensor, bool copyData=false)
A specialized output tiling function when the output is tiled rowwise.
Definition: smv_convolution_tiling.cpp:249
smaug::ConvolutionOp::getInputPadding
std::vector< int > getInputPadding() const
Compute padding sizes on the row/column boundaries of the input feature map.
Definition: convolution_op.h:143
smaug::smv::TilingConfig
A TilingConfig describes tiling strategies and optimal tile sizes for inputs, weights,...
Definition: smv_tiling_common.h:29
smaug
The smaug namespace is the parent namespace of all C++ code in SMAUG.
Definition: backend.cpp:38
common.h
Utilities for writing and invoking Aladdin kernels from Operators.
smaug::smv::conv::TilingOptimizer::determineBestTilingDims
static std::array< TilingDims, 3 > determineBestTilingDims(Tensor *inputs, Tensor *weights, Tensor *outputs, int maxTileSize)
Determine the best tiling dimensions for running convolution on SMV.
Definition: smv_convolution_tiling.cpp:13
smaug::generateTiledTensor
TiledTensor generateTiledTensor(Tensor *tensor, const TensorShape &tileShape, Operator *op, bool copyData)
Generates a TiledTensor from a source Tensor.
Definition: tensor_utils.cpp:335
smaug::SmvConvolutionOp
SMV backend implementation of convolution.
Definition: smv_convolution_op.h:27