Skip to content

Latest commit



698 lines (565 loc) · 18.9 KB

File metadata and controls

698 lines (565 loc) · 18.9 KB


./tensor.h is the header file that defines the tensor data structures and their functions:

  • CPU and GPU
  • Shape
  • Stream
  • TRValue :public expr::RValueExp<Container, DType>
  • Tensor :public TRValue<Tensor<Device, dimension, DType>, Device, dimension, DType>
  • Tensor<Device, 1, DType> :public TRValue<Tensor<Device, 1, DType>, Device, 1, DType>


The structs cpu and gpu exist so that device checks can succeed before the final evaluation.

// device name CPU: a tag type that carries compile-time device flags
struct cpu {
  // whether this device is CPU or not
  static const bool kDevCPU = true;
  // device flag number, identifies this device (bit 0)
  static const int kDevMask = 1 << 0;
};
// device name GPU: a tag type that carries compile-time device flags
struct gpu {
  // whether this device is CPU or not
  static const bool kDevCPU = false;
  // device flag number, identifies this device (bit 1)
  static const int kDevMask = 1 << 1;
};


The struct Shape provides a flexible building block for the construction of the class Tensor.

Member variables

kDimension is the number of dimensions of the current shape, while kSubdim (kDimension - 1) is the number of dimensions of a sub-shape and can also be used as the index of the lowest dimension in the Tensor.

static const int kDimension = dimension;
static const int kSubdim = dimension - 1;

Given kDimension, we can declare an array shape_ of unsigned integers (index_t) that stores the shape of the Tensor.

// typedef unsigned index_t;
index_t shape_[kDimension]; 


After the definition of these three member variables, we can write the constructors that create a struct Shape.

// default constructor: intentionally does nothing, so shape_ is left
// uninitialized until the caller fills it in
MSHADOW_XINLINE Shape(void) {}
// copy constructor: copies the extent of every dimension from s
// s: source shape; the parameter type forces it to have the same kDimension
MSHADOW_XINLINE Shape(const Shape<kDimension> &s) {
  #pragma unroll
  for (int i = 0; i < kDimension; ++i) {
    this->shape_[i] = s[i];
  }
}

Taking const Shape<kDimension> &s as the constructor parameter ensures that a new Shape can only be initialized from a Shape with the same kDimension.

Overloaded Operator

Operator [] is overloaded to return the extent of the dimension at the requested index.

// get the extent of dimension idx as a mutable reference
// idx: dimension index; no bounds check is performed
MSHADOW_XINLINE index_t &operator[](index_t idx) {
  return shape_[idx];
}
// get the extent of dimension idx; the returned reference is constant
// idx: dimension index; no bounds check is performed
MSHADOW_XINLINE const index_t &operator[](index_t idx) const {
  return shape_[idx];
}

Operators == and != are overloaded to check whether two Shape structs are equal.

// return whether two shapes are equal, i.e. every dimension extent matches
// s: the shape to compare against; the parameter type Shape<kDimension>
//    implicitly guarantees that it has the same number of dimensions
MSHADOW_XINLINE bool operator==(const Shape<kDimension> &s) const {
  #pragma unroll
  for (int i = 0; i < kDimension; ++i) {
    if (s.shape_[i] != this->shape_[i]) return false;
  }
  return true;
}
// return whether two shapes are not equal (the negation of operator==)
// s: the shape to compare against
MSHADOW_XINLINE bool operator!=(const Shape<kDimension> &s) const {
  return !(*this == s);
}

Operator << is overloaded to output the shape_ of Shape.

template<int dim>
  friend std::ostream &operator<<(std::ostream &os, const Shape<dim> &shape);

Above is only a declaration, while its definition is in ./tensor_cpu-inl.h.

// write a shape to an output stream as a python style tuple, e.g. (5,3,6)
// os:    the stream to write to
// shape: the shape to print
// returns os, so that output calls can be chained
template<int ndim>
inline std::ostream &operator<<(std::ostream &os, const Shape<ndim> &shape) {
  os << '(';
  for (int i = 0; i < ndim; ++i) {
    if (i != 0) os << ',';
    os << shape[i];
  }
  // python style tuple: a one-element tuple keeps a trailing comma, e.g. (5,)
  if (ndim == 1) os << ',';
  os << ')';
  return os;
}

Member Functions


The Size function returns the product of the extents of all dimensions, e.g. (5,3,6) -> 90.

// return the total number of elements, i.e. the product of all dimension
// extents, e.g. (5,3,6) -> 90
MSHADOW_XINLINE size_t Size(void) const {
  // start from the first extent and accumulate in size_t
  size_t size = this->shape_[0];
  #pragma unroll
  for (int i = 1; i < kDimension; ++i) {
    size *= this->shape_[i];
  }
  return size;
}


The FlatTo1D function returns a Shape with kDimension=1 whose single extent equals the product of all extents of the original Shape, e.g. (5,3,6) -> (90)

// flatten the shape into one dimension whose extent is Size(),
// e.g. (5,3,6) -> (90)
MSHADOW_XINLINE Shape<1> FlatTo1D(void) const {
  Shape<1> s;
  s[0] = this->Size();
  return s;
}


The FlatTo2D function returns a Shape with kDimension=2: its second dimension is the lowest dimension of the original Shape, and its first dimension is the product of all the other dimensions, e.g. (5,3,6) -> (15,6).

The reason for doing so will be considered and explained later.
// flatten the shape into two dimensions: the lowest dimension is kept as the
// second extent and all higher dimensions are multiplied into the first one,
// e.g. (5,3,6) -> (15,6)
MSHADOW_XINLINE Shape<2> FlatTo2D(void) const {
  Shape<2> s;
  s.shape_[1] = this->shape_[kDimension - 1];
  // product of every dimension except the lowest one (1 when kDimension == 1)
  index_t ymax = 1;
  #pragma unroll
  for (int i = 0; i < kDimension - 1; ++i) {
    ymax *= this->shape_[i];
  }
  s.shape_[0] = ymax;
  return s;
}


The function ProdShape returns the product of shape in range [dimstart, dimend).

// return the product of the extents in the range [dimstart, dimend)
// dimstart: first dimension included in the product
// dimend:   one past the last dimension included in the product
// an empty range yields 1; the indices are not bounds checked
MSHADOW_XINLINE index_t ProdShape(int dimstart, int dimend) const {
  index_t num = 1;
  #pragma unroll
  for (int i = dimstart; i < dimend; ++i) {
    num *= this->shape_[i];
  }
  return num;
}


The function SubShape returns a new shape whose kDimension is one less than that of the original, obtained by dropping the highest dimension, e.g. (3,2,6,4) -> (2,6,4)

Since it is mainly built for CUDA, its effectiveness will be considered and explained later.
// return the shape without its highest dimension,
// e.g. (3,2,6,4) -> (2,6,4)
MSHADOW_XINLINE Shape<kSubdim> SubShape(void) const {
  Shape<kSubdim> s;
  // for cuda
  #pragma unroll
  for (int i = 0; i < kSubdim; ++i) {
    // shift by one to skip dimension 0 of this shape
    s.shape_[i] = this->shape_[i + 1];
  }
  return s;
}


The function Slice returns a new shape that contains the dimensions in the range [dimstart, dimend) of the original one, so its kDimension is dimend - dimstart.

// return the dimensions in the range [dimstart, dimend) as a new shape
// dimstart: first dimension to keep (template parameter)
// dimend:   one past the last dimension to keep (template parameter)
// e.g. (3,4,5,6,7).Slice<2,5>() -> (5,6,7)
template<int dimstart, int dimend>
MSHADOW_XINLINE Shape<dimend - dimstart> Slice(void) const {
  Shape<dimend - dimstart> s;
  #pragma unroll
  for (int i = dimstart; i < dimend; ++i) {
    s[i - dimstart] = this->shape_[i];
  }
  return s;
}

It can be used in the following way, e.g. (3,4,5,6,7) -> (5,6,7).

// usage
Shape<5> ss = Shape5(3,4,5,6,7);
Shape<3> sss = ss.Slice<2,5>();
cout << sss <<endl;
// output (5,6,7)

Useful Construction

As the usage example above shows, several helper functions are provided as APIs for constructing a struct Shape.

// useful construction functions to generate shape
// construct a one dimensional shape (s0)
MSHADOW_XINLINE Shape<1> Shape1(index_t s0) {
  Shape<1> s; s[0] = s0;
  return s;
}

// construct a two dimensional shape (s0, s1)
MSHADOW_XINLINE Shape<2> Shape2(index_t s0, index_t s1) {
  Shape<2> s; s[0] = s0; s[1] = s1;
  return s;
}

// construct a three dimensional shape (s0, s1, s2)
MSHADOW_XINLINE Shape<3> Shape3(index_t s0, index_t s1, index_t s2) {
  Shape<3> s;
  s[0] = s0; s[1] = s1; s[2] = s2;
  return s;
}

// construct a four dimensional shape (s0, s1, s2, s3)
MSHADOW_XINLINE Shape<4> Shape4(index_t s0, index_t s1,
                                index_t s2, index_t s3) {
  Shape<4> s;
  s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
  return s;
}

// construct a five dimensional shape (s0, s1, s2, s3, s4)
MSHADOW_XINLINE Shape<5> Shape5(index_t s0, index_t s1, index_t s2,
                                index_t s3, index_t s4) {
  Shape<5> s;
  s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s[4] = s4;
  return s;
}


The Stream here is only a dummy implementation for the CPU; we leave it for further discussion until we reach the GPU implementation.

// computation stream bound to a device
template<typename Device>
struct Stream {
  // this is only a dummy implementation for CPU
  // for GPU, the actual implementation will be specialized in tensor_gpu-inl.h

  // wait for all the computation associated with this stream to complete;
  // a no-op in this dummy implementation
  inline void Wait(void) {}

  // query whether the stream is idle
  // return true if the stream is idle and all the jobs have been completed;
  // this dummy implementation always reports idle
  inline bool CheckIdle(void) {
    return true;
  }

  // create a blas handle; a no-op in this dummy implementation
  inline void CreateBlasHandle() {}
};


:public expr::RValueExp<Container, DType>

This is Tensor RValue, which is also the super type of all kinds of possible tensors.

Why it exists is essential for understanding the MShadow framework, so
we postpone its discussion to the core part of this tutorial.
template<typename Container, typename Device, int dimension, typename DType>
struct TRValue: public expr::RValueExp<Container, DType> {


:public TRValue<Tensor<Device, dimension, DType>, Device, dimension, DType>

The struct Tensor is the key element in MShadow.

//in /mshadow/base.h

//#define MSHADOW_DEFAULT_DTYPE = default_real_t

//typedef float default_real_t;
//as a result, the default data type of Tensor is float
template<typename Device, int dimension, typename DType MSHADOW_DEFAULT_DTYPE>
struct Tensor: public TRValue<Tensor<Device, dimension, DType>, Device, dimension, DType> {
// Trival Usage
Tensor<cpu, 3> ts(data, Shape3(2,5,2));

Member Variables

The variable kDevCPU indicates in which type of device the data are stored. And the kSubdim is same as in the struct Shape.

static const bool kDevCPU = Device::kDevCPU;
static const int  kSubdim = dimension - 1;

The pointer dptr_ points to the data wherever it is stored. Besides, the shape of data is controlled by the struct Shape to make it flexible to handle.

DType *dptr_;
Shape<dimension> shape_;

At last, the stride_ variable is used to deal with pitch allocation in gpu or sse (align x dimension to 64bit) for efficiency.

the concept of stream is important for GPU devices, we leave the discussion to there.
index_t stride_;
// stream where the computation lies
// stream is a device dependency concept
Stream<Device> *stream_;


It is worth noting that, when no stride is given, `stride_` is initialized to the extent of the lowest dimension of
`Shape` (shape[kSubdim]). The reason for doing so remains to be examined and discussed.
// default constructor: only stream_ is set (to NULL); dptr_, shape_ and
// stride_ are not initialized here
MSHADOW_XINLINE Tensor(void) : stream_(NULL) {}

// constructor from shape: shape_ is copied and stream_ is NULL;
// dptr_ and stride_ are not initialized here
MSHADOW_XINLINE Tensor(const Shape<dimension> &shape): shape_(shape), stream_(NULL) {}

// constructor from data pointer and shape, without stride:
// stride_ is taken from the lowest dimension, shape[kSubdim]; stream_ is NULL
MSHADOW_XINLINE Tensor(DType *dptr, const Shape<dimension> &shape)
	: dptr_(dptr), shape_(shape), stride_(shape[kSubdim]), stream_(NULL) {}

// constructor from data pointer, shape and stream, without stride:
// stride_ is taken from the lowest dimension, shape[kSubdim]
MSHADOW_XINLINE Tensor(DType *dptr, const Shape<dimension> &shape,Stream<Device> *stream)
    : dptr_(dptr), shape_(shape), stride_(shape[kSubdim]), stream_(stream) {}

// constructor from data pointer, shape, stride and stream:
// every member is taken from the arguments as given
MSHADOW_XINLINE Tensor(DType *dptr, const Shape<dimension> &shape, index_t stride, Stream<Device> *stream)
    : dptr_(dptr), shape_(shape), stride_(stride), stream_(stream) {}

Member Functions

MSize and MemSize

The function MSize returns the memory footprint of the tensor in number of elements, including the padding of the aligned x dimension (the product starts from stride_ instead of the extent of the lowest dimension). The function MemSize<startdim> returns the same quantity counted from dimension startdim onwards.

// return the memory footprint of the whole tensor in number of elements,
// counted with stride_ as the lowest dimension (same as MemSize<0>)
MSHADOW_XINLINE size_t MSize(void) const {
  return this->MemSize<0>();
}

// return the memory footprint, in number of elements, of the dimensions
// from startdim down to the lowest one
// startdim: first dimension included in the product (template parameter)
template<int startdim>
MSHADOW_XINLINE size_t MemSize(void) const {
  // the lowest dimension contributes stride_ rather than shape_[kSubdim]
  size_t memsz = this->stride_;
  #pragma unroll
  for (int i = startdim; i < kSubdim; ++i) {
    memsz *= this->shape_[i];
  }
  return memsz;
}


The function size returns the extent of the specified dimension.

// return the extent of dimension idx
// idx: dimension index; no bounds check is performed
MSHADOW_XINLINE index_t size(index_t idx) const {
  return shape_[idx];
}

FlatTo1D and FlatTo2D

The functions FlatTo1D and FlatTo2D return a new tensor with same data (unchanged dptr_), but different shape (refer to FlatTo1D and FlatTo2D in the context of Shape).

// view this tensor as a 1D tensor: the data pointer, stride and stream are
// passed on unchanged, only the shape is flattened with Shape::FlatTo1D
MSHADOW_XINLINE Tensor<Device, 1, DType> FlatTo1D(void) const {
  return Tensor<Device, 1, DType>(dptr_, shape_.FlatTo1D(), stride_, stream_);
}

// view this tensor as a 2D tensor: the data pointer, stride and stream are
// passed on unchanged, only the shape is flattened with Shape::FlatTo2D
MSHADOW_XINLINE Tensor<Device, 2, DType> FlatTo2D(void) const {
  return Tensor<Device, 2, DType>(dptr_, shape_.FlatTo2D(), stride_, stream_);
}


The function Slice returns a new Tensor, which is a subset of the highest dimension. e.g. (128,3,224,224) -> (64,3,224,224)

// return a tensor that covers the range [begin, end) of the highest
// dimension, e.g. (128,3,224,224) -> (64,3,224,224)
// begin: first index of the highest dimension to include
// end:   one past the last index to include
// the result points into the same data (no copy); begin and end are not
// checked against the shape
MSHADOW_XINLINE Tensor<Device, dimension, DType>
Slice(index_t begin, index_t end) const {
  Shape<dimension> s = this->shape_;
  s[0] = end - begin;
  // MemSize<1>() is the number of elements of one entry of the highest dimension
  return Tensor<Device, dimension, DType>(dptr_ + this->MemSize<1>() * begin, s, stride_, stream_);
}

Overloaded Operators

Operator []

The operator [] is overloaded to return the sub-tensor at the given index of the highest dimension of the Tensor.

The code dptr_ + this->MemSize<1>() * idx is to fetch the idx sub-tensor in Tensor, e.g. (128,3,224,224)[5] -> 5-th (3,224,224)

// return the idx-th sub-tensor along the highest dimension,
// e.g. (128,3,224,224)[5] -> the (3,224,224) tensor at index 5
// idx: index into the highest dimension; no bounds check is performed
// the result points into the same data (no copy)
MSHADOW_XINLINE Tensor<Device, kSubdim, DType> operator[](index_t idx) const {
  return Tensor<Device, kSubdim, DType>(dptr_ + this->MemSize<1>() * idx, shape_.SubShape(), stride_, stream_);
}

Operator =

The operator = is overloaded to be assignment operator when the rhs (right hand side) is also a Tensor variable.

// implement the assignment of same type: copies the data pointer, shape,
// stride and stream of exp, so both tensors refer to the same data afterwards
// (the elements themselves are not copied)
inline Tensor<Device, dimension, DType> &
operator=(const Tensor<Device, dimension, DType> &exp) {
  dptr_ = exp.dptr_;
  shape_ = exp.shape_;
  stride_ = exp.stride_;
  stream_ = exp.stream_;
  return *this;
}

However, if the rhs is a scalar, e.g. 3.0f, or an Exp (expression) type, the operator = is overloaded to trigger the computation by calling the __assign function, which is defined in its grandparent class RValueExp.

// assignment from an expression: we trigger computation at here by
// forwarding the expression to __assign (not defined in this snippet)
// exp: the expression to evaluate into this tensor
template<typename E, int etype>
inline Tensor<Device, dimension, DType> &
operator=(const expr::Exp<E, DType, etype> &exp) {
  return this->__assign(exp);
}

// assignment from a scalar: forwards the value to __assign
// (not defined in this snippet)
// exp: the scalar value to assign
inline Tensor<Device, dimension, DType> &
operator=(const DType &exp) {
  return this->__assign(exp);
}

It is worth noting that several other assignment-related operators are also overloaded, but in the grandparent class RValueExp. We will defer their analysis until we reach that class.

Missing Explanations

the usage of #pragma unroll:

If the for loop that follows has a trip count that is known at compile time, the loop is expanded (unrolled) at compile time to speed up execution, e.g. for(i = 1; i < 10; i++){...}; will be expanded.

Otherwise, if the trip count cannot be determined at compile time, the loop is kept as an ordinary loop, e.g. for(i = 1; i < n; i++){...}; stays the same, since the compiler cannot know the exact value of n until run time.

Missing Component


ConvertLayout is left to the discussion of MxNet, which uses it to do convolution.

// Convert a 4d shape in src_layout to the same shape in dst_layout
// src:        the shape to convert, with its dimensions ordered as src_layout
// src_layout: layout of src, kNCHW or kNHWC
// dst_layout: requested layout of the result, kNCHW or kNHWC
// returns the converted shape; any other layout value is reported with
// LOG(FATAL)
inline Shape<4> ConvertLayout(const Shape<4>& src, int src_layout, int dst_layout) {
  // step 1: bring src into NCHW order
  Shape<4> dst;
  switch (src_layout) {
  case kNCHW:
    dst = src;
    break;
  case kNHWC:
    dst[0] = src[0];
    dst[2] = src[1];
    dst[3] = src[2];
    dst[1] = src[3];
    break;
  default:
    LOG(FATAL) << "Invalid layout for 4d shape " << src_layout;
    dst = src;  // keeps dst defined in case LOG(FATAL) returns
  }
  // step 2: reorder the NCHW shape into dst_layout
  Shape<4> dst2;
  switch (dst_layout) {
  case kNCHW:
    return dst;
  case kNHWC:
    dst2[0] = dst[0];
    dst2[1] = dst[2];
    dst2[2] = dst[3];
    dst2[3] = dst[1];
    break;
  default:
    // report the layout that is actually invalid here (dst_layout)
    LOG(FATAL) << "Invalid layout for 4d shape " << dst_layout;
    dst2 = src;  // keeps dst2 defined in case LOG(FATAL) returns
  }
  return dst2;
}
// Convert a 5d shape in src_layout to the same shape in dst_layout
// src:        the shape to convert, with its dimensions ordered as src_layout
// src_layout: layout of src, kNCDHW or kNDHWC
// dst_layout: requested layout of the result, kNCDHW or kNDHWC
// returns the converted shape; any other layout value is reported with
// LOG(FATAL)
inline Shape<5> ConvertLayout(const Shape<5>& src, int src_layout, int dst_layout) {
  // step 1: bring src into NCDHW order
  Shape<5> dst;
  switch (src_layout) {
  case kNCDHW:
    dst = src;
    break;
  case kNDHWC:
    dst[0] = src[0];
    dst[2] = src[1];
    dst[3] = src[2];
    dst[4] = src[3];
    dst[1] = src[4];
    break;
  default:
    LOG(FATAL) << "Invalid layout for 5d shape " << src_layout;
    dst = src;  // keeps dst defined in case LOG(FATAL) returns
  }
  // step 2: reorder the NCDHW shape into dst_layout
  Shape<5> dst2;
  switch (dst_layout) {
  case kNCDHW:
    return dst;
  case kNDHWC:
    dst2[0] = dst[0];
    dst2[1] = dst[2];
    dst2[2] = dst[3];
    dst2[3] = dst[4];
    dst2[4] = dst[1];
    break;
  default:
    // report the layout that is actually invalid here (dst_layout)
    LOG(FATAL) << "Invalid layout for 5d shape " << dst_layout;
    dst2 = src;  // keeps dst2 defined in case LOG(FATAL) returns
  }
  return dst2;
}

Tensor:set_stream and Tensor::CheckContiguous

These two functions are heavily related to GPU implementation, so they are left for further discussion.

// set the stream on which the computation of this tensor lies
// stream: the stream to attach; only the pointer is stored
inline void set_stream(Stream<Device> *stream) {
  this->stream_ = stream;
}

// return whether the tensor's memory is contiguous, i.e. the extent of the
// lowest dimension equals stride_ (no padding in the x dimension)
MSHADOW_XINLINE bool CheckContiguous(void) const {
  return this->shape_[dimension - 1] == stride_;
}

Tensor<Device, 1, DType>

We must specialize struct Tensor for the 1D case, since the implementation of the overloaded operator [] is different: it returns a reference to an element instead of a sub-tensor.

It can also be considered as a review of member variables and functions in original Tensor.

template<typename Device, typename DType>
struct Tensor<Device, 1, DType>:
      public TRValue<Tensor<Device, 1, DType>, Device, 1, DType> {
  DType *dptr_;
  Shape<1> shape_;
  index_t stride_;
  Stream<Device> *stream_;
  // constructor
  MSHADOW_XINLINE Tensor(void) : stream_(NULL) {}
  MSHADOW_XINLINE Tensor(const Shape<1> &shape)
      : shape_(shape), stream_(NULL) {}
  MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape)
      : dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(NULL) {}
  MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape, Stream<Device> *stream)
      : dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(stream) {}
  MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape,
                         index_t stride, Stream<Device> *stream)
      : dptr_(dptr), shape_(shape), stride_(stride), stream_(stream) {}
  inline void set_stream(Stream<Device> *stream) {
    this->stream_ = stream;
  MSHADOW_XINLINE Tensor<Device, 1, DType> FlatTo1D(void) const {
    return *this;
  MSHADOW_XINLINE Tensor<Device, 2, DType> FlatTo2D(void) const {
    return Tensor<Device, 2, DType>(dptr_, shape_.FlatTo2D(), stride_, stream_);
  MSHADOW_XINLINE Tensor<Device, 1, DType> Slice(index_t begin, index_t end) const {
    Shape<1> s;
    s[0] = end  - begin;
    return Tensor<Device, 1, DType>(dptr_ + begin, s, s[0], stream_);
  MSHADOW_XINLINE bool CheckContiguous(void) const {
    return true;
  MSHADOW_XINLINE size_t MSize(void) const {
    return shape_[0];
  MSHADOW_XINLINE index_t size(index_t i) const {
    return shape_[0];
  MSHADOW_XINLINE DType &operator[](index_t idx) {
    return dptr_[idx];
  MSHADOW_XINLINE const DType &operator[](index_t idx) const {
    return dptr_[idx];
  // implement the assignment of same type
  inline Tensor<Device, 1, DType> &
  operator=(const Tensor<Device, 1, DType> &exp) {
    dptr_ = exp.dptr_;
    shape_ = exp.shape_;
    stride_ = exp.stride_;
    stream_ = exp.stream_;
    return *this;
  template<typename E, int etype>
  inline Tensor<Device, 1, DType> &
  operator=(const expr::Exp<E, DType, etype> &exp) {
    return this->__assign(exp);
  inline Tensor<Device, 1, DType> &operator=(const DType &exp) {
    return this->__assign(exp);