// SMAUG: Simulating Machine Learning Applications on gem5-Aladdin
// smv_pooling_op.cpp
#include "smaug/core/backend.h"
#include "smaug/operators/common.h"
#include "smaug/operators/smv/smv_pooling_op.h"
#include "smaug/operators/smv/smv_pooling_tiling.h"
#include "smaug/operators/smv/smv_kernels.h"
#include "smaug/utility/debug_stream.h"
7 
8 namespace smaug {
namespace smv {
namespace pool {

// Width of one SMV vector, in elements, as used by the pooling tiling
// optimizer. Presumably matches the accelerator's SIMD lane count — the
// value itself (8) is all this file establishes.
const int kVectorSize = 8;

}  // namespace pool
}  // namespace smv
16 
17 // This function iterates the tiles generated by the tiling optimizer and send
18 // inputs/outputs tile duo to the hardware kernel for computation. The tile
19 // iteration is in the following order:
20 // 1) N: batch-wise tiles in the inputs.
21 // 2) H: Rowwise tiles in the inputs.
22 // 3) W: column-wise tiles in the inputs.
23 // 4) C: Channelwise tiles in the inputs/weights.
24 void SmvPoolingOp::runNHWC(TiledTensor& inputs, TiledTensor& outputs) {
25  int inputIfmapTiles = inputs.getShape()[0];
26  int inputRowTiles = inputs.getShape()[1];
27  int inputColTiles = inputs.getShape()[2];
28  int inputChanTiles = inputs.getShape()[3];
29  int outputChanTiles = outputs.getShape()[3];
30  auto inputIdx = inputs.startIndex();
31  auto outputIdx = outputs.startIndex();
33  smv::kPoolingHw, "host_inputs", getInputsMemType());
35  smv::kPoolingHw, "host_results", getOutputsMemType());
36  for (int N = 0; N < inputIfmapTiles; N++) {
37  for (int H = 0; H < inputRowTiles; H++) {
38  for (int W = 0; W < inputColTiles; W++) {
39  int iC = 0, oC = 0;
40  // This keeps track of the channel offset of the outputs.
41  int ofmapOffset = 0;
42  while (iC < inputChanTiles && oC < outputChanTiles) {
43  int inputTileIdx = inputIdx(N, H, W, iC);
44  int outputTileIdx = outputIdx(N, H, W, oC);
45  // If the outputs don't need tiling on channels whereas the
46  // inputs need it, the tiling optimizer allows the output
47  // tile to have different number of channels from the input
48  // tile.
49  dout(1) << "Input: " << inputTileIdx
50  << ", output: " << outputTileIdx << "\n";
51  Tensor* inputTile = inputs.getTileWithData(inputTileIdx);
52  Tensor* outputTile = outputs[outputTileIdx];
53  const TensorShape& inputShape = inputTile->getShape();
54  const TensorShape& outputShape = outputTile->getShape();
55  mapArrayToAccel(smv::kPoolingHw, "host_inputs",
56  inputTile->data<float16>(),
57  inputShape.storageSize() * sizeof(float16));
59  smv::kPoolingHw, "host_results",
60  outputTile->data<float16>(),
61  outputShape.storageSize() * sizeof(float16));
62  int inputDims[4] = { inputShape[0], inputShape[1],
63  inputShape[2], inputShape[3] };
64  int outputDims[4] = { outputShape[0], outputShape[1],
65  outputShape[2], outputShape[3] };
66  // If the input and output tiles belong to the same channel
67  // group, then their data will be loaded at the same time
68  // into the spads, so we start from the beginning of the
69  // tile. Otherwise, we start from the last place we left off
70  // from.
71  int ofmapStart = (iC == oC) ? 0 : ofmapOffset;
72 
74  smv::kPoolingHw,
75  opType == MaxPooling ? smv_maxpooling_nhwc_vec_fxp
77  inputTile->data<float16>(),
78  outputTile->data<float16>(), smv::spad0, smv::spad1,
79  inputDims, outputDims, inputShape.getPadding(3),
80  outputShape.getPadding(3), getPoolingSize().first,
81  getPoolingSize().second, getPoolingStride().first,
82  getPoolingStride().second, ofmapStart, &sampling);
83 
84  ofmapOffset += inputTile->getShape()[3];
85  if (inputChanTiles == outputChanTiles) {
86  iC++;
87  oC++;
88  } else if (outputChanTiles == 1) {
89  iC++;
90  } else {
91  assert(false &&
92  "The inputs/outputs tiles can have different "
93  "number of channels only when the outputs don't "
94  "need channelwise tiling.");
95  }
96  }
97  }
98  }
99  }
100 }
101 
102 void SmvPoolingOp::tile() {
103  // This function will tile (if necessary) the input/output tensors
104  // of the pooling operator into smaller tensor tiles so that each tile
105  // can fit in the corresponding scratchpad of the accelerator.
106  tiledTensors = smaug::smv::pool::TilingOptimizer::doTiling(this);
107 }
108 
110  auto input = getInput(Inputs);
111  auto output = getOutput(Outputs);
112  const TensorShape& inputShape = input->getShape();
113  const TensorShape& outputShape = output->getShape();
114  assert(inputShape.getLayout() == DataLayout::NHWC);
115  assert(outputShape.getLayout() == DataLayout::NHWC);
116 
117  {
118  auto stats = gem5::ScopedStats(
119  stats::kTensorPrepStart, stats::kTensorPrepEnd);
120  tiledTensors[0].copyDataToAllTiles();
121  }
122 
123  runNHWC(tiledTensors[0], tiledTensors[1]);
124 
125  {
126  auto stats = gem5::ScopedStats(
127  stats::kTensorFinalStart, stats::kTensorFinalEnd);
128  tiledTensors[1].untile();
129  }
130 }
131 
132 void SmvMaxPoolingOp::tile() { SmvPoolingOp::tile(); }
133 
134 void SmvAvgPoolingOp::tile() { SmvPoolingOp::tile(); }
135 
137 
139 
140 } // namespace smaug
141 
smaug::Tensor
Tensor represents a single multi-dimensional array of data.
Definition: tensor.h:344
smaug::gem5::ScopedStats
A RAII helper class which dumps and/or resets gem5 stats at construction and destruction.
Definition: utils.h:118
smaug::Tensor::data
const T * data() const
Returns a const pointer to the Tensor data.
Definition: tensor.h:521
smaug::dout
const DebugStream & dout(int debugLevel)
Returns a DebugStream instance for the given debug level.
Definition: debug_stream.cpp:16
smaug::SmvMaxPoolingOp::run
void run() override
Executes the Operator.
Definition: smv_pooling_op.cpp:136
smaug::TiledTensor
A multidimensional container of Tensors.
Definition: tensor.h:552
smaug::SmvAvgPoolingOp::run
void run() override
Executes the Operator.
Definition: smv_pooling_op.cpp:138
smaug::setArrayMemTypeIfSimulating
void setArrayMemTypeIfSimulating(unsigned reqCode, const char *arrayName, MemoryType memType)
Sets what memory access mechanism the accelerator will use when accessing this array.
Definition: common.cpp:21
smaug::TensorShape
TensorShape describes the shape of a Tensor.
Definition: tensor.h:35
smaug::Operator::outputs
std::vector< TensorBase * > outputs
An ordered list of output tensors produced by this operator.
Definition: operator.h:141
smaug::SmvPoolingOp::run
void run() override
Executes the Operator.
Definition: smv_pooling_op.cpp:109
smv_maxpooling_nhwc_vec_fxp
void smv_maxpooling_nhwc_vec_fxp(float16 *host_inputs, float16 *host_results, float *inputs, float *results, int inputs_dims[4], int results_dims[4], int inputs_pad, int results_pad, int pool_rows, int pool_cols, int row_stride, int col_stride, int ofmap_start, SamplingInfo *sampling)
Definition: pooling.c:36
smaug
The smaug namespace is the parent namespace of all C++ code in SMAUG.
Definition: backend.cpp:38
common.h
Utilities for writing and invoking Aladdin kernels from Operators.
smv_avgpooling_nhwc_vec_fxp
void smv_avgpooling_nhwc_vec_fxp(float16 *host_inputs, float16 *host_results, float *inputs, float *results, int inputs_dims[4], int results_dims[4], int inputs_pad, int results_pad, int pool_rows, int pool_cols, int row_stride, int col_stride, int ofmap_start, SamplingInfo *sampling)
Definition: pooling.c:166
smaug::mapArrayToAccel
void mapArrayToAccel(unsigned reqCode, const char *arrayName, void *baseAddr, size_t size)
Maps an array of data to the accelerator.
Definition: common.cpp:12
smaug::invokeKernel
void invokeKernel(int accelIdx, unsigned reqCode, const Kernel &kernel, Args &&... args)
The generic blocking interface for all accelerator kernel functions.
Definition: common.h:72
smaug::Operator::inputs
std::vector< TensorBase * > inputs
An ordered list of input tensors consumed by this operator.
Definition: operator.h:134