#include <algorithm>

#include "smaug/core/backend.h"
#include "smaug/operators/smv/smv_convolution_op.h"
#include "smaug/operators/smv/smv_convolution_tiling.h"
#include "smaug/utility/debug_stream.h"

namespace smaug {
namespace smv {
namespace conv {
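// This file implements the tiling optimizer for SMV convolution operators:
// it picks tiling dimensions for the inputs, weights, and outputs, enumerates
// candidate tile shapes that fit in the accelerator scratchpad, and selects
// the configuration with the highest scratchpad utilization.
//
// Helpers such as findBestTilingDims() and enum4DTensorTilingConfigs() are
// assumed to come from the shared SMV tiling base class; their definitions
// are not part of this excerpt.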
std::array<TilingDims, 3> TilingOptimizer::determineBestTilingDims(
        Tensor* inputs, Tensor* weights, Tensor* outputs, int maxTileSize) {
    // Pick the best tiling strategy separately for the inputs, weights, and
    // outputs. Actual tile sizes are computed later.
    TilingDims bestInputTilingDims = findBestTilingDims(
            inputs->getShape(), maxTileSize,
            { 1, weights->getShape()[1],
              inputs->getShape()[2], kNumMaccsPerPE });
    TilingDims bestWeightTilingDims = findBestTilingDims(
            weights->getShape(), maxTileSize,
            { kNumPEs, weights->getShape()[1],
              weights->getShape()[2], kNumMaccsPerPE });
    assert(bestWeightTilingDims != TilingDims::DimNH &&
           "Weights cannot be tiled by dimensions NH!");
    TilingDims bestOutputTilingDims = findBestTilingDims(
            outputs->getShape(), maxTileSize,
            { 1, 1, outputs->getShape()[2], kNumPEs });

    // N-wise weight tiling forces channelwise output tiling, so each weight
    // tile maps to a matching set of output channels.
    if (needsNwiseTiling(bestWeightTilingDims))
        bestOutputTilingDims = DimNC;
    // Rowwise input tiling forces rowwise output tiling (rowwise plus
    // channelwise if the outputs already needed channel tiling).
    if (needsHwiseTiling(bestInputTilingDims)) {
        if (needsCwiseTiling(bestOutputTilingDims))
            bestOutputTilingDims = DimNCH;
        else
            bestOutputTilingDims = DimNH;
    }
    return { bestInputTilingDims, bestWeightTilingDims, bestOutputTilingDims };
}
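// computeBasicTileShapes (the name is assumed; the signature is not part of
// this excerpt) determines the best tiling strategy and tile shapes for one
// convolution: it enumerates input tile candidates, then compatible weight
// tiles, then output tiles, and keeps the configuration with the largest
// total scratchpad footprint.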
TilingConfig TilingOptimizer::computeBasicTileShapes(SmvConvolutionOp* op) {
    Tensor* inputs = op->getInput(op->Inputs);
    Tensor* weights = op->getInput(op->Kernels);
    Tensor* outputs = op->getOutput(op->Outputs);
    int maxTileSize = SmvBackend::SpadSize() / inputs->getDataTypeSize();
    std::array<TilingDims, 3> strategies =
            determineBestTilingDims(inputs, weights, outputs, maxTileSize);
    TilingDims inputTilingDims = strategies[0];
    TilingDims weightTilingDims = strategies[1];
    TilingDims outputTilingDims = strategies[2];

    dout(2) << "  Tiling dimensions chosen:\n"
            << "    input: " << inputTilingDims
            << ", weight: " << weightTilingDims
            << ", output: " << outputTilingDims << "\n";

    TensorShape inputsShape = inputs->getShape();
    TensorShape weightsShape = weights->getShape();
    TensorShape outputsShape = outputs->getShape();
    std::vector<TensorShape> inputConfigs;
    if (inputTilingDims == DimN) {
        std::vector<int> minShape = inputsShape.dims();
        minShape[0] = 1;
        enum4DTensorTilingConfigs(
                inputsShape, maxTileSize, minShape, { 1, 1, 1, 1 },
                inputConfigs);
    } else if (inputTilingDims == DimNC) {
        std::vector<int> minShape = inputsShape.dims();
        minShape[0] = 1;
        minShape[3] = kNumMaccsPerPE;
        enum4DTensorTilingConfigs(
                inputsShape, maxTileSize, minShape,
                { 1, 1, 1, kNumMaccsPerPE }, inputConfigs);
    } else if (inputTilingDims == DimNH) {
        std::vector<int> minShape = inputsShape.dims();
        minShape[0] = 1;
        minShape[1] = weightsShape[1];
        enum4DTensorTilingConfigs(
                inputsShape, maxTileSize, minShape,
                { 1, op->getRowStride(), 1, 1 }, inputConfigs);
    } else if (inputTilingDims == DimNCH) {
        std::vector<int> minShape = { 1, weightsShape[1], inputsShape[2],
                                      kNumMaccsPerPE };
        std::vector<int> strides = { 1, op->getRowStride(), 1, kNumMaccsPerPE };
        enum4DTensorTilingConfigs(
                inputsShape, maxTileSize, minShape, strides, inputConfigs);
    } else {
        // No input tiling is needed: the whole input is a single tile.
        inputConfigs.push_back(inputsShape);
    }
    assert(!inputConfigs.empty() && "No tiling configurations found!");
    std::list<TilingConfig> inputWeightConfigs;
    for (auto it = inputConfigs.begin(); it != inputConfigs.end(); ++it) {
        TensorShape& inputsShape = *it;
        if (weightTilingDims == DimN) {
            int minOfmaps = std::min(weightsShape[0], kNumPEs);
            for (int n = minOfmaps; n <= weightsShape[0]; n += kNumPEs) {
                TilingConfig config;
                config.weights = weightsShape;
                config.weights[0] = n;
                config.weights[3] = inputsShape[3];
                if (config.weights.storageSize() <= maxTileSize) {
                    config.inputs = inputsShape;
                    inputWeightConfigs.push_back(config);
                } else {
                    break;
                }
            }
        } else if (weightTilingDims == DimNC) {
            int minOfmaps = std::min(weightsShape[0], kNumPEs);
            int minChannels = std::min(weightsShape[3], kNumMaccsPerPE);
            for (int n = minOfmaps; n <= weightsShape[0]; n += kNumPEs) {
                TilingConfig config;
                config.weights = weightsShape;
                config.weights[0] = n;
                if (needsCwiseTiling(inputTilingDims)) {
                    // If the inputs are tiled channelwise, the weight tile
                    // must use the same channel dimension.
                    config.weights[3] = inputsShape[3];
                    if (config.weights.storageSize() <= maxTileSize) {
                        config.inputs = inputsShape;
                        inputWeightConfigs.push_back(config);
                    } else {
                        break;
                    }
                } else {
                    // Otherwise the weight channels can be tiled
                    // independently in steps of kNumMaccsPerPE.
                    for (int c = minChannels; c <= weightsShape[3];
                         c += kNumMaccsPerPE) {
                        config.weights[3] = c;
                        if (config.weights.storageSize() <= maxTileSize) {
                            config.inputs = inputsShape;
                            inputWeightConfigs.push_back(config);
                        } else {
                            break;
                        }
                    }
                }
            }
        } else if (weightTilingDims == DimNH || weightTilingDims == DimNCH) {
            assert(false && "Weights can't be tiled rowwise!");
        } else {
            // The weights don't need tiling.
            TilingConfig config;
            config.inputs = inputsShape;
            config.weights = weightsShape;
            if (needsCwiseTiling(inputTilingDims)) {
                // Channelwise-tiled inputs still require the weight tile to
                // match the input tile's channel dimension.
                config.weights[3] = inputsShape[3];
            }
            inputWeightConfigs.push_back(config);
        }
    }
    assert(!inputWeightConfigs.empty() && "No tiling configurations found!");
    std::vector<TilingConfig> fullConfigs;
    for (auto it = inputWeightConfigs.begin(); it != inputWeightConfigs.end();
         ++it) {
        int minChannels = std::min(it->weights[0], kNumPEs);
        bool weightsNeedTiling = (weightTilingDims != None);
        for (int c = minChannels; c <= weightsShape[0]; c += kNumPEs) {
            TilingConfig config = *it;
            config.outputs = outputsShape;
            config.outputs[0] = config.inputs[0];
            if (needsHwiseTiling(outputTilingDims)) {
                int padding = op->getPadding() == SamePadding
                                      ? FRAC_CEIL(weightsShape[1] - 1, 2)
                                      : 0;
                config.outputs[1] = op->computeOutputDim(config.inputs[1],
                                                         weightsShape[1],
                                                         op->getRowStride(),
                                                         padding);
                config.outputs[3] = config.weights[0];
            } else {
                config.outputs[1] = outputsShape[1];
                if (weightsNeedTiling)
                    config.outputs[3] = config.weights[0];
                // If only the outputs need tiling, their channel dimension is
                // set by the iteration variable.
                else if (outputTilingDims != None)
                    config.outputs[3] = c;
            }
            if (config.outputs.storageSize() <= maxTileSize) {
                fullConfigs.push_back(config);
            }
            // If the output shape is uniquely determined, there is no need to
            // explore other output channel sizes.
            if (weightsNeedTiling || outputTilingDims == None)
                break;
        }
    }

    dout(2) << "  Number of possible tiling configs: " << fullConfigs.size()
            << "\n";
    for (auto& config : fullConfigs)
        dout(2) << "    " << config << "\n";
    // Pick the configuration that maximizes total scratchpad utilization.
    auto maxIt = std::max_element(
            fullConfigs.begin(),
            fullConfigs.end(),
            [](const TilingConfig& c1, const TilingConfig& c2) {
                return c1.getTotalSize() < c2.getTotalSize();
            });
    assert(maxIt != fullConfigs.end() && "Failed to get best tiling config!");
    // Fill in the tiling dims of the chosen config.
    (*maxIt).inputTilingDims = inputTilingDims;
    (*maxIt).weightTilingDims = weightTilingDims;
    (*maxIt).outputTilingDims = outputTilingDims;
    return *maxIt;
}
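// generateRowwiseOutputTiledTensor (the name and parameter order are assumed
// from how it is used below) builds the output TiledTensor when the inputs
// are tiled rowwise: each output tile's rows and columns are recomputed from
// its input tile, the original convolution padding is applied only to the
// first and last row tiles, and every tile is placed at its origin within
// the full output tensor.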
TiledTensor TilingOptimizer::generateRowwiseOutputTiledTensor(
        SmvConvolutionOp* op,
        const TiledTensor& inputTiledTensor,
        const TiledTensor& weightsTiledTensor,
        const TensorShape& maxOutputTileSize,
        Tensor* outputTensor,
        bool copyData) {
    const TensorShape& inputShape = inputTiledTensor.getShape();
    const TensorShape& weightsShape = weightsTiledTensor.getShape();
    const TensorShape& outputShape = outputTensor->getShape();
    int weightRows = op->getWeightRows();
    int weightCols = op->getWeightCols();
    bool samePadding = op->getPadding() == SamePadding;
    // Padding applied to the original (untiled) input.
    auto inputPadding = op->getInputPadding();
    int topRowPad = inputPadding[0];
    int bottomRowPad = inputPadding[1];
    int leftColPad = inputPadding[2];
    int rightColPad = inputPadding[3];
    std::vector<int> numBlocksInDim{ inputShape[0], inputShape[1],
                                     inputShape[2], weightsShape[0] };
    // With stride > 1, the last rowwise input tile may not have enough rows
    // for a single convolution window. In that case the output tiled tensor
    // has one fewer block in the row dimension.
    int lastTileRows =
            inputTiledTensor[inputTiledTensor.size() - 1]->getShape()[1];
    if (lastTileRows + bottomRowPad < weightRows)
        numBlocksInDim[1]--;
    TiledTensor outputTiledTensor(
            TensorShape(numBlocksInDim, inputShape.getLayout()), outputTensor);
    const int ndims = outputShape.ndims();
    std::vector<int> currentOrigin(ndims, 0);
    auto inputIndex = inputTiledTensor.startIndex();
    auto weightIndex = weightsTiledTensor.startIndex();
    auto outputIndex = outputTiledTensor.startIndex();
    for (int n = 0; n < numBlocksInDim[0]; n++) {
        for (int h = 0; h < numBlocksInDim[1]; h++) {
            for (int w = 0; w < numBlocksInDim[2]; w++) {
                for (int c = 0; c < numBlocksInDim[3]; c++) {
                    const Tensor* inputTile =
                            inputTiledTensor[inputIndex(n, h, w, 0)];
                    const Tensor* weightsTile =
                            weightsTiledTensor[weightIndex(c, 0, 0, 0)];
                    const TensorShape& inputTileShape = inputTile->getShape();
                    // Top/bottom padding only applies to the first/last
                    // rowwise tiles.
                    int effInputRows = inputTileShape[1];
                    if (h == 0)
                        effInputRows += topRowPad;
                    else if (h == numBlocksInDim[1] - 1)
                        effInputRows += bottomRowPad;
                    int effInputCols =
                            inputTileShape[2] + leftColPad + rightColPad;
                    int outputRows = op->computeOutputDim(effInputRows,
                                                          weightRows,
                                                          op->getRowStride(),
                                                          ValidPadding);
                    int outputCols = op->computeOutputDim(effInputCols,
                                                          weightCols,
                                                          op->getColStride(),
                                                          ValidPadding);
                    TensorShape outputTileShape(
                            { inputTileShape[0], outputRows, outputCols,
                              weightsTile->getShape()[0] },
                            outputTensor->getShape().getLayout(),
                            SmvBackend::Alignment);
                    assert(outputTileShape.storageSize() <=
                                   maxOutputTileSize.storageSize() &&
                           "DimNH input tiling results in output tile sizes "
                           "larger than the max tile size!");
                    int oi = outputIndex(n, h, w, c);
                    std::string tileName = op->getName() + ":" +
                                           outputTensor->getName() +
                                           "/tile:" + std::to_string((int)oi);
                    Tensor* outputTile = new Tensor(tileName, outputTileShape);
                    // Allocate backing storage for the tile (assumed to use
                    // the output tensor's data type).
                    outputTile->allocateStorage(outputTensor->getDataType());
                    outputTiledTensor.setTile(
                            oi, currentOrigin, outputTile, copyData);
                    // Advance the tile origin like a multi-digit counter,
                    // innermost dimension first.
                    for (int i = ndims - 1; i >= 0; i--) {
                        currentOrigin[i] += outputTileShape[i];
                        if (currentOrigin[i] >= outputShape[i])
                            currentOrigin[i] = 0;
                        else
                            break;
                    }
                }
            }
        }
    }
    op->getWorkspace()->addTiledTensor(outputTiledTensor);
    dout(1) << "  Tiled Tensor " << outputTensor->getName() << "(rowwise):\n"
            << "    original tensor shape: " << outputTensor->getShape()
            << "\n"
            << "    number of tiles: " << outputTiledTensor.size() << "\n";
    return outputTiledTensor;
}
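// doTiling is the top-level entry point: it computes the best tile shapes
// and then materializes tiled tensors for the inputs, weights, and outputs.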
std::array<TiledTensor, 3> TilingOptimizer::doTiling(SmvConvolutionOp* op) {
    auto input = op->getInput(SmvConvolutionOp::Inputs);
    auto kernels = op->getInput(SmvConvolutionOp::Kernels);
    auto output = op->getOutput(SmvConvolutionOp::Outputs);
    TilingConfig tileConfig = computeBasicTileShapes(op);
    // The tile generation calls are elided in the original excerpt;
    // generateTiledTensor is assumed to be the shared helper that builds a
    // TiledTensor from a tensor and a tile shape.
    TiledTensor tiledInputs = generateTiledTensor(input, tileConfig.inputs, op);
    TiledTensor tiledWeights =
            generateTiledTensor(kernels, tileConfig.weights, op);
    TiledTensor tiledOutputs;
    if (needsHwiseTiling(tileConfig.outputTilingDims)) {
        // Rowwise output tiles must be derived from the input and weight
        // tiles so that each output tile's rows match its input tile.
        tiledOutputs = generateRowwiseOutputTiledTensor(
                op, tiledInputs, tiledWeights, tileConfig.outputs, output,
                /* copyData */ false);
    } else {
        tiledOutputs = generateTiledTensor(output, tileConfig.outputs, op);
    }
    return { tiledInputs, tiledWeights, tiledOutputs };
}

}  // namespace conv
}  // namespace smv
}  // namespace smaug
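// Example call site (a hypothetical sketch; the convolution operator's run()
// method is assumed to drive tiling this way):
//
//   std::array<TiledTensor, 3> tiles =
//           smv::conv::TilingOptimizer::doTiling(op);
//   TiledTensor& tiledInputs = tiles[0];
//   TiledTensor& tiledWeights = tiles[1];
//   TiledTensor& tiledOutputs = tiles[2];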