SMAUG
Simulating Machine Learning Applications on gem5-Aladdin
tensor_utils.cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iostream>

#include "fp16.h"
#include "smaug/core/tensor.h"
#include "smaug/core/tensor_utils.h"
#include "smaug/core/workspace.h"
#include "smaug/utility/debug_stream.h"
8 
9 namespace smaug {
10 
11 template <>
12 void printTensorElement<float16>(std::ostream& os,
13  const float16* data,
14  int index) {
15  os << fp16_ieee_to_fp32_value(data[index]);
16 }
17 
18 std::ostream& operator<<(std::ostream& os, const TensorShape& shape) {
19  os << "(";
20  for (int i = 0; i < shape.ndims(); i++) {
21  os << shape[i];
22  if (i != shape.ndims() - 1)
23  os << ", ";
24  }
25  os << ")";
26  return os;
27 }
28 
29 std::ostream& operator<<(std::ostream& os, const TensorIndexIterator& iter) {
30  os << "( ";
31  for (int i = 0; i < iter.dims.size(); ++i) {
32  os << iter.state[i] << " ";
33  }
34  os << ")";
35  return os;
36 }
37 
38 std::ostream& operator<<(std::ostream& os, const Tensor& tensor) {
39  DataType type = tensor.getDataType();
40  switch (type) {
41  case Float16:
42  writeTensorToOstream<uint16_t>(os, tensor);
43  break;
44  case Float32:
45  writeTensorToOstream<float>(os, tensor);
46  break;
47  case Float64:
48  writeTensorToOstream<double>(os, tensor);
49  break;
50  case Int32:
51  writeTensorToOstream<int>(os, tensor);
52  break;
53  case Int64:
54  writeTensorToOstream<int64_t>(os, tensor);
55  break;
56  case Bool:
57  writeTensorToOstream<bool>(os, tensor);
58  break;
59  default:
60  assert(false && "Unknown data type!");
61  }
62  return os;
63 }
64 
66  Tensor* src,
67  std::vector<int> destOrigin,
68  std::vector<int> srcOrigin,
69  std::vector<int> regionSize) {
70  assert(dest->ndims() == src->ndims());
71  assert(dest->getDataType() == src->getDataType());
72  switch (dest->getDataType()) {
73  case Float16:
74  internal::copyTensorRegion<uint16_t>(
75  dest, src, destOrigin, srcOrigin, regionSize);
76  break;
77  case Float32:
78  internal::copyTensorRegion<float>(
79  dest, src, destOrigin, srcOrigin, regionSize);
80  break;
81  case Float64:
82  internal::copyTensorRegion<double>(
83  dest, src, destOrigin, srcOrigin, regionSize);
84  break;
85  case Int32:
86  internal::copyTensorRegion<int>(
87  dest, src, destOrigin, srcOrigin, regionSize);
88  break;
89  case Int64:
90  internal::copyTensorRegion<int64_t>(
91  dest, src, destOrigin, srcOrigin, regionSize);
92  break;
93  case Bool:
94  internal::copyTensorRegion<bool>(
95  dest, src, destOrigin, srcOrigin, regionSize);
96  break;
97  default:
98  assert(false && "Unknown data type!");
99  }
100 }
101 
103  Tensor* src,
104  std::vector<int> destOrigin,
105  std::vector<int> srcOrigin,
106  int copySize) {
107  assert(dest->getDataType() == src->getDataType());
108  switch (dest->getDataType()) {
109  case Float16:
110  internal::copyTensorData<uint16_t>(
111  dest, src, destOrigin, srcOrigin, copySize);
112  break;
113  case Float32:
114  internal::copyTensorData<float>(
115  dest, src, destOrigin, srcOrigin, copySize);
116  break;
117  case Float64:
118  internal::copyTensorData<double>(
119  dest, src, destOrigin, srcOrigin, copySize);
120  break;
121  case Int32:
122  internal::copyTensorData<int>(
123  dest, src, destOrigin, srcOrigin, copySize);
124  break;
125  case Int64:
126  internal::copyTensorData<int64_t>(
127  dest, src, destOrigin, srcOrigin, copySize);
128  break;
129  case Bool:
130  internal::copyTensorData<bool>(
131  dest, src, destOrigin, srcOrigin, copySize);
132  break;
133  default:
134  assert(false && "Unknown data type!");
135  }
136 }
137 
139  Tensor* src,
140  int destOffset,
141  int srcOffset,
142  int copySize) {
143  assert(dest->getDataType() == src->getDataType());
144  switch (dest->getDataType()) {
145  case Float16:
146  internal::copyRawTensorData<uint16_t>(
147  dest, src, destOffset, srcOffset, copySize);
148  break;
149  case Float32:
150  internal::copyRawTensorData<float>(
151  dest, src, destOffset, srcOffset, copySize);
152  break;
153  case Float64:
154  internal::copyRawTensorData<double>(
155  dest, src, destOffset, srcOffset, copySize);
156  break;
157  case Int32:
158  internal::copyRawTensorData<int>(
159  dest, src, destOffset, srcOffset, copySize);
160  break;
161  case Int64:
162  internal::copyRawTensorData<int64_t>(
163  dest, src, destOffset, srcOffset, copySize);
164  break;
165  case Bool:
166  internal::copyRawTensorData<bool>(
167  dest, src, destOffset, srcOffset, copySize);
168  break;
169  default:
170  assert(false && "Unknown data type!");
171  }
172 }
173 
174 namespace internal {
175 // Compute the tile size in this dimension with padding accounted for. The goal
176 // is to get the tile dimension size that doesn't have any elements unused,
177 // given the padding, weight and stride sizes.
178 //
179 // Args:
180 // maxTileDim: Maximum size of the tile size in this dimension.
181 // padding: Padding in the dimension.
182 // weightDim: Weight size in this dimension.
183 // stride: Stride size in this dimension.
184 // Returns:
185 // The tile size in this dimension.
186 int computePaddedTileDim(int maxTileDim,
187  int padding,
188  int weightDim,
189  int stride) {
190  // The number of strides we can take in this dimension.
191  int numStrides = (maxTileDim + padding - weightDim) / stride;
192  if (numStrides <= 0)
193  return maxTileDim;
194  int tileDim = weightDim + stride * numStrides;
195  return tileDim - padding;
196 }
197 } // namespace internal
198 
200  const TensorShape& tileShape,
201  Operator* op,
202  bool copyData) {
203  const TensorShape& inputShape = tensor->getShape();
204  int inputSize = inputShape.storageSize();
205  int tileSize = tileShape.storageSize();
206  int numTiles = std::ceil(inputSize * 1.0 / tileSize);
207  TiledTensor tiledTensor(
208  TensorShape({ 1, numTiles }, DataLayout::NC), tensor, true);
209  int remainingSize = inputSize;
210  int srcOffset = 0;
211  for (auto tileIndex = tiledTensor.startIndex(); !tileIndex.end();
212  ++tileIndex) {
213  int currentTileSize = std::min(remainingSize, tileSize);
214  TensorShape currentShape({ 1, currentTileSize },
215  DataLayout::NC,
216  tileShape.getAlignment());
217  std::string tileName = op->getName() + ":" + tensor->getName() +
218  "/tile:" + std::to_string((int)tileIndex);
219  Tensor* tile = new Tensor(tileName, currentShape);
220  tile->allocateStorage(tensor->getDataType());
221  tiledTensor.setTile(tileIndex, { srcOffset }, tile, copyData);
222  srcOffset += currentTileSize;
223  remainingSize -= currentTileSize;
224  }
225  op->getWorkspace()->addTiledTensor(tiledTensor);
226  dout(1) << " Tiled Tensor " << tensor->getName() << ":\n"
227  << " original tensor shape: " << tensor->getShape() << "\n"
228  << " tile shape " << tileShape
229  << ", number of tiles: " << tiledTensor.size() << "\n";
230  return tiledTensor;
231 }
232 
234  Tensor* tensor,
235  const TensorShape& tileShape,
236  Operator* op,
237  int fieldRows,
238  int fieldCols,
239  int rowStride,
240  int colStride,
241  PaddingType paddingType,
242  bool copyData) {
243  const TensorShape& inputShape = tensor->getShape();
244  const int ndims = inputShape.ndims();
245  DataLayout layout = inputShape.getLayout();
246  // Compute the tiling halos. These are the rows/columns that the subsequent
247  // tile will overlap with the previous tile.
248  std::vector<int> tilingHalos(ndims, 0);
249  int hIdx = layout == NHWC ? 1 : NCHW ? 2 : -1;
250  int wIdx = layout == NHWC ? 2 : NCHW ? 3 : -1;
251  // The tilingHalos could be negative if fieldRows < rowStride, but we
252  // actually want that. For example, fieldRows = 1, rowStride = 2, then what
253  // the next tile wants is not to "borrow" any rows from the previous tile,
254  // but skip one row and start from there. So, -1 actually gives us the
255  // skipping effect.
256  if (hIdx != -1 && fieldRows != 0)
257  tilingHalos[hIdx] = fieldRows - rowStride;
258  if (wIdx != -1 && fieldCols != 0)
259  tilingHalos[wIdx] = fieldCols - colStride;
260  // Compute the input paddings.
261  int totalRowPad = (paddingType == SamePadding) ? fieldRows - 1 : 0;
262  int totalColPad = (paddingType == SamePadding) ? fieldCols - 1 : 0;
263  int topPad = FRAC_CEIL(totalRowPad, 2);
264  int leftPad = FRAC_CEIL(totalColPad, 2);
265  // This contains tile shapes in each dimension.
266  std::vector<std::vector<int>> tilesInDim(ndims);
267  // Compute the tile shapes in each dimension.
268  for (int i = 0; i < ndims; i++) {
269  int remaining = inputShape[i];
270  while (remaining > 0) {
271  int tileDim = std::min(tileShape[i], remaining);
272  bool firstTileInDim = tilesInDim[i].size() == 0;
273  bool lastTileInDim = remaining <= tileShape[i];
274  // Adjust the tile dimension size if we are at the first tile
275  // because of the top/left paddings.
276  if (i == hIdx && firstTileInDim && !lastTileInDim) {
277  tileDim = internal::computePaddedTileDim(
278  tileDim, topPad, fieldRows, rowStride);
279  } else if (i == wIdx && firstTileInDim && !lastTileInDim) {
280  tileDim = internal::computePaddedTileDim(
281  tileDim, leftPad, fieldCols, colStride);
282  }
283  tilesInDim[i].push_back(tileDim);
284  remaining -= tileDim;
285  if (remaining > 0)
286  remaining += tilingHalos[i];
287  }
288  }
289  std::vector<int> numBlocksInDim(ndims, 0);
290  for (int i = 0; i < ndims; i++)
291  numBlocksInDim[i] = tilesInDim[i].size();
292  TiledTensor tiledTensor(
293  TensorShape(numBlocksInDim, inputShape.getLayout()), tensor);
294  if (tiledTensor.size() == 1) {
295  // If there's only one tile, we don't need to tile the original tensor.
296  // So directly use it as the tile.
297  tiledTensor[0] = tensor;
298  } else {
299  std::vector<int> currentOrigin(ndims, 0);
300  for (auto tileIndex = tiledTensor.startIndex(); !tileIndex.end();
301  ++tileIndex) {
302  std::vector<int> currentTileShape(ndims);
303  for (int i = 0; i < ndims; i++)
304  currentTileShape[i] = tilesInDim[i][tileIndex.currentIndex(i)];
305  TensorShape currentShape(currentTileShape,
306  tileShape.getLayout(),
307  tileShape.getAlignment());
308  std::string tileName = op->getName() + ":" + tensor->getName() +
309  "/tile:" + std::to_string((int)tileIndex);
310  Tensor* tile = new Tensor(tileName, currentShape);
311  tile->allocateStorage(tensor->getDataType());
312  tiledTensor.setTile(tileIndex, currentOrigin, tile, false);
313  for (int i = ndims - 1; i >= 0; i--) {
314  currentOrigin[i] += currentShape[i];
315  if (currentOrigin[i] >= inputShape[i]) {
316  currentOrigin[i] = 0;
317  } else {
318  currentOrigin[i] -= tilingHalos[i];
319  break;
320  }
321  }
322  }
323  }
324  if (copyData) {
325  tiledTensor.copyDataToAllTiles();
326  }
327  op->getWorkspace()->addTiledTensor(tiledTensor);
328  dout(1) << " Tiled Tensor " << tensor->getName() << ":\n"
329  << " original tensor shape: " << tensor->getShape() << "\n"
330  << " tile shape: " << tileShape
331  << ", number of tiles: " << tiledTensor.size() << "\n";
332  return tiledTensor;
333 }
334 
336  const TensorShape& tileShape,
337  Operator* op,
338  bool copyData) {
340  tensor, tileShape, op, 0, 0, 1, 1, ValidPadding, copyData);
341 }
342 
343 void flattenTiledTensor(TiledTensor& tiledTensor, Tensor* destTensor) {
344  const TensorShape& tensorShape = destTensor->getShape();
345  int ndims = tensorShape.ndims();
346  int destOffset = 0;
347  for (auto tileIndex = tiledTensor.startIndex(); !tileIndex.end();
348  ++tileIndex) {
349  Tensor* tile = tiledTensor[tileIndex];
350  const TensorShape& tileShape = tile->getShape();
352  destTensor, tile, destOffset, 0, tileShape.storageSize());
353  destOffset += tileShape.storageSize();
354  }
355 }
356 
357 Tensor* concatTensors(std::vector<Tensor*> inputTensors,
358  int concatDim,
359  Workspace* workspace) {
360  std::string outputName = inputTensors[0]->getName();
361  TensorShape inputShape = inputTensors[0]->getShape();
362  std::vector<int> outputDims = inputShape.dims();
363  // Calculate the shape for the output tensor.
364  for (int i = 1; i < inputTensors.size(); i++) {
365  outputName += ("-" + inputTensors[i]->getName());
366  outputDims[concatDim] += inputTensors[i]->getShape()[concatDim];
367  }
368  TensorShape outputShape(
369  outputDims, inputShape.getLayout(), inputShape.getAlignment());
370  Tensor* outputTensor = new Tensor(outputName, outputShape);
371  workspace->addTensor(outputTensor);
372  outputTensor->allocateStorage(inputTensors[0]->getDataType());
373  // Copy data into the output tensor.
374  int ndims = inputShape.ndims();
375  std::vector<int> currentOrigin(ndims, 0);
376  std::vector<int> srcOrigin(ndims, 0);
377  for (int i = 0; i < inputTensors.size(); i++) {
378  TensorShape srcShape = inputTensors[i]->getShape();
379  copyTensorRegion(outputTensor,
380  inputTensors[i],
381  currentOrigin,
382  srcOrigin,
383  srcShape.dims());
384  currentOrigin[concatDim] += srcShape[concatDim];
385  }
386  return outputTensor;
387 }
388 
389 } // namespace smaug
smaug::Tensor
Tensor represents a single multi-dimensional array of data.
Definition: tensor.h:344
smaug::dout
const DebugStream & dout(int debugLevel)
Returns a DebugStream instance for the given debug level.
Definition: debug_stream.cpp:16
smaug::concatTensors
Tensor * concatTensors(std::vector< Tensor * > inputTensors, int concatDim, Workspace *workspace)
Concatenates Tensors on the specified dimension into one single tensor.
Definition: tensor_utils.cpp:357
smaug::copyTensorRegion
void copyTensorRegion(Tensor *dest, Tensor *src, std::vector< int > destOrigin, std::vector< int > srcOrigin, std::vector< int > regionSize)
Copies a region of a source Tensor to a corresponding region in a destination Tensor.
Definition: tensor_utils.cpp:65
smaug::copyTensorData
void copyTensorData(Tensor *dest, Tensor *src, std::vector< int > destOrigin, std::vector< int > srcOrigin, int copySize)
Similar to copyTensorRegion, but the region is a contiguous block of memory.
Definition: tensor_utils.cpp:102
smaug::Tensor::allocateStorage
T * allocateStorage()
Allocates memory to store Tensor data.
Definition: tensor.h:473
smaug::Workspace
Workspace is the container and owner of all Tensors and Operators in the Network.
Definition: workspace.h:17
smaug::TiledTensor
A multidimensional container of Tensors.
Definition: tensor.h:552
tensor_utils.h
Utility functions for copying/printing/tiling tensors.
FRAC_CEIL
#define FRAC_CEIL(A, B)
Implements the ceiling function of A/B.
Definition: common.h:505
smaug::TiledTensor::setTile
void setTile(int index, const std::vector< int > &origin, Tensor *tensor, bool copyData)
Set the specified tile to the provided Tensor, and optionally copy data into it.
Definition: tensor.cpp:71
smaug::copyRawTensorData
void copyRawTensorData(Tensor *dest, Tensor *src, int destOffset, int srcOffset, int copySize)
Directly copies a linear region of memory from dest to src, without taking dimensions/padding into ac...
Definition: tensor_utils.cpp:138
smaug::generateTiledTensorPerBatchNC
TiledTensor generateTiledTensorPerBatchNC(Tensor *tensor, const TensorShape &tileShape, Operator *op, bool copyData)
Tile the provided NC Tensor per batch.
Definition: tensor_utils.cpp:199
smaug::TensorShape
TensorShape describes the shape of a Tensor.
Definition: tensor.h:35
smaug::generateTiledTensorWithStrideAndPadding
TiledTensor generateTiledTensorWithStrideAndPadding(Tensor *tensor, const TensorShape &tileShape, Operator *op, int fieldRows, int fieldCols, int rowStride, int colStride, PaddingType paddingType, bool copyData)
Generates a TiledTensor from a source Tensor with the specified tile shape.
Definition: tensor_utils.cpp:233
smaug::Operator
Operator is the base class for all graph operators supported by SMAUG.
Definition: operator.h:28
smaug
The smaug namespace is the parent namespace of all C++ code in SMAUG.
Definition: backend.cpp:38
smaug::generateTiledTensor
TiledTensor generateTiledTensor(Tensor *tensor, const TensorShape &tileShape, Operator *op, bool copyData)
Generates a TiledTensor from a source Tensor.
Definition: tensor_utils.cpp:335
smaug::TiledTensor::copyDataToAllTiles
void copyDataToAllTiles()
Copies data (if needed) to all the tiles from the original Tensor.
Definition: tensor.cpp:116
smaug::flattenTiledTensor
void flattenTiledTensor(TiledTensor &tiledTensor, Tensor *destTensor)
Copies the data from each tile in a TiledTensor into a destination Tensor as a contiguous block of me...
Definition: tensor_utils.cpp:343