Program Listing for File add.cu

#include "tensors/gpu/add.h"
#include "tensors/gpu/add_all.h"

#include "tensors/gpu/cuda_helpers.h"

#include "functional/functional.h"
#include "functional/shape.h"
#include "functional/tensor.h"
#include "functional/tmp.h"

namespace marian {

namespace gpu {
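
// All kernels below share the same template parameters:
//   K          - number of input tensors
//   Functor    - element-wise operation applied to corresponding input elements
//   AggFunctor - operation used to aggregate the per-element results (e.g. a sum)
//   T          - storage type of the tensors (e.g. float or half)
//   AccType    - type in which the aggregation is carried out (typically float)
//
// gAggregateGeneric handles the fully general case: the output shape may be an
// arbitrary broadcast-compatible reduction of the combined input shape.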

template <size_t K, class Functor, class AggFunctor, typename T, typename AccType>
__global__ void gAggregateGeneric(Functor functor,                                 // functor applied to corresponding elements of the input tensors (via broadcasting),
                                  AccType aggInit,                                 // aggInit is the starting value of the accumulation (e.g. 0 for a sum),
                                  AggFunctor aggFunctor,                           // aggFunctor is used to accumulate values (e.g. a sum),
                                  const functional::Shape full,                    // maximal combined shape of all tensors via broadcasting
                                  functional::Tensor<T> out,                       // output tensor
                                  functional::Array<functional::Tensor<T>, K> ins, // input tensors
                                  AccType scale = 1.0) {                           // scale the accumulated result, e.g. by 1/N to compute a mean from a sum over N elements
  int outLength = out.shape().elements();
  bool same = outLength == full.elements();
  for(int i = 0; i < K; ++i)
    same = same && outLength == ins[i].shape().elements();

  constexpr size_t N = functional::Shape::size();
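  // len[i]: how many positions of the full (broadcast) shape collapse onto each
  // output position along axis i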
  functional::Array<int, N> len;
  for(int i = 0; i < N; ++i)
    len[i] = full[i] / out.shape()[i];

  functional::Array<int, N> dims;
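  // grid-stride loop: each thread handles every (gridDim.x * blockDim.x)-th output element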
  for(int bid = 0; bid < outLength; bid += blockDim.x * gridDim.x) {
    int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
    if(index < outLength) {
      if(same) {
        out[index] = (T)aggFunctor((AccType)out[index], functional::applyWithCast<AccType>(functor, ins, index) * scale); // apply functor with arguments cast to AccType
      } else {
        out.shape().dims(index, dims);
        out[index] = (T)aggFunctor((AccType)out[index], functional::loops(functor, aggInit, aggFunctor, ins, len, dims) * scale); // aggregate over the collapsed axes, with arguments cast to AccType
      }
    }
  }
}
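
// Kernel for the case where the output has as many elements as the full broadcast
// shape, i.e. nothing is reduced: each thread aggregates a single output element,
// translating indices only for inputs that are broadcast.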

template <size_t K, class Functor, class AggFunctor, typename T, typename AccType>
__global__ void gAggregateEqual(Functor functor, AggFunctor aggFunctor,
                                functional::Tensor<T> out,
                                functional::Array<functional::Tensor<T>, K> ins,
                                AccType scale,
                                bool broadcast) {
  int length = out.shape().elements();
  functional::Array<int, functional::Shape::size()> dims;

  for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
    int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
    if(index < length) {
      functional::Array<int, K> indices;
      indices.fill(index);

      if(broadcast) {
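        // map the linear output index to multi-dimensional coordinates and back
        // to each (possibly smaller) input's own linear index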
        out.shape().dims(index, dims);
        for(size_t i = 0; i < K; ++i)
          indices[i] = ins[i].shape().bindex(dims);
      }

      out[index] = (T)aggFunctor((AccType)out[index], functional::applyWithCast<AccType>(functor, ins, indices) * scale);
    }
  }
}
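
// Kernel that reduces the innermost (last) axis of the full shape: one block works
// on one row of `cols` elements (grid-strided over the rows), the threads of the
// block reduce that row cooperatively via shared memory, and the result is
// aggregated into the corresponding output element.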

template <size_t K, class Functor, class AggFunctor, typename T, typename AccType = float>
__global__ void gAggregateReduce(Functor functor, AccType aggInit, AggFunctor aggFunctor,
                                 const functional::Shape full,
                                 functional::Tensor<T> out,
                                 functional::Array<functional::Tensor<T>, K> ins,
                                 AccType scale = 1.0) {
  int rows = full.elements() / full.back();
  int cols = full.back();

  bool same = true; // do all inputs have the same number of elements?
  for(int i = 0; i < K; ++i)
    same = same && ins[i].shape().elements() == full.elements();

  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int j = bid + blockIdx.x;
    if(j < rows) {
      // declare shared memory as raw bytes rather than as type T so the declaration
      // is identical for all element types; it is then reinterpreted as AccType
      extern __shared__ uint8_t _sharedBytes[];
      AccType* _sum = (AccType*)_sharedBytes;

      if(same) {
        _sum[threadIdx.x] = aggInit;
        for(int tid = 0; tid < cols; tid += blockDim.x) {
          int id = tid + threadIdx.x;
          if(id < cols)
            _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::applyWithCast<AccType>(functor, ins, j * cols + id)); // cast arguments to AccType before applying the functor, so the operation runs in AccType
        }
      } else {
        functional::Array<int, functional::Shape::size()> dims;
        _sum[threadIdx.x] = aggInit;

        for(int tid = 0; tid < cols; tid += blockDim.x) {
          int id = tid + threadIdx.x;
          if(id < cols) {
            full.dims(j * cols + id, dims);
            functional::Array<int, K> indices;
            for(int i = 0; i < K; ++i)
              indices[i] = ins[i].shape().bindex(dims);
            _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::applyWithCast<AccType>(functor, ins, indices)); // cast arguments to AccType before applying the functor, so the operation runs in AccType
          }
        }
      }
      __syncthreads();
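      // block-wide tree reduction of the per-thread partial results in shared memory:
      // each step folds the upper half of the active range onto the lower half,
      // halving `len` (rounding up) until the final value is left in _sum[0]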
      int len = blockDim.x;
      while(len != 1) {
        __syncthreads();
        int skip = (len + 1) >> 1;
        if(threadIdx.x < (len >> 1)) {
          _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], _sum[threadIdx.x + skip]);
        }
        len = (len + 1) >> 1;
      }
      __syncthreads();
      if(threadIdx.x == 0) // only thread 0 of each block writes the result
        out[j] = aggFunctor(out[j], (T)(_sum[0] * scale));
    }
    __syncthreads();
  }
}
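
// Chooses the most specific kernel for the given shapes:
//  - the output has a single element: full reduction via AggregateAll
//  - only the innermost axis is reduced: row-wise reduction via gAggregateReduce
//  - the output shape equals the full broadcast shape (no reduction): gAggregateEqual
//  - anything else: the fully generic gAggregateGeneric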

template <typename T, typename AccType, class Functor, class AggFunctor, class... Tensors>
void AggregateTyped(Functor functor, AccType aggInit, AggFunctor aggFunctor, AccType scale, marian::Tensor out, Tensors... tensors) {
  cudaSetDevice(out->getDeviceId().no);

  auto full = marian::Shape::broadcast({out, tensors...});

  int length = out->shape().elements();

  constexpr size_t K = sizeof...(Tensors);

  functional::Tensor<T> gOut = out;
  functional::Array<functional::Tensor<T>, K> gIns = {tensors...};

  if(out->shape().elements() == 1) { // reduce everything into a single element
    AggregateAll<T, AccType>(nullptr, functor, aggInit, aggFunctor, scale, out, tensors...); // @TODO: pass allocator in here, currently uses cudaMalloc
  } else if(full.back() != 1 && out->shape().back() == 1 && full.elements() / full.back() == length) { // only the last axis is reduced: the number of output elements must match the full shape on all non-reduced axes
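    // launch one block per row and one thread per column (capped at MAX_BLOCKS and
    // MAX_THREADS); shared memory holds one AccType partial result per thread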
    size_t m = full.elements() / full.back(); // how many rows are we iterating over?
    size_t k = full.back();                   // how many columns are being reduced to 1 in each row?

    int blocks  = std::min(MAX_BLOCKS,  (int)m);
    int threads = std::min(MAX_THREADS, (int)k);
    int shared  = sizeof(AccType) * threads;
    gAggregateReduce<K, Functor, AggFunctor, T, AccType><<<blocks, threads, shared>>>(functor, aggInit, aggFunctor, full, gOut, gIns, scale);
  } else if(out->shape() == full) {
    int threads = std::min(MAX_THREADS, length);
    int blocks  = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

    bool broadcast = false;
    for(int i = 0; i < K; ++i)
      broadcast = broadcast || gOut.shape() != gIns[i].shape();
    gAggregateEqual<<<blocks, threads>>>(functor, aggFunctor, gOut, gIns, scale, broadcast);
  } else {
    int threads = std::min(MAX_THREADS, length);
    int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

    gAggregateGeneric<<<blocks, threads>>>(functor, aggInit, aggFunctor, full, gOut, gIns, scale);
  }
}
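
// Dispatch on the element type of the output tensor; both float32 and (when
// compiled with FP16 support) float16 storage accumulate in float.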

template <class Functor, class AggFunctor, class... Tensors>
void Aggregate(Functor functor, float aggInit, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors) {
  if(out->type() == Type::float32) {
    AggregateTyped<float, float>(functor, aggInit, aggFunctor, scale, out, tensors...);
  } else if(out->type() == Type::float16) {
#if COMPILE_FP16
    AggregateTyped<half,  float>(functor, aggInit, aggFunctor, scale, out, tensors...);
#else
     ABORT("FP16 not supported with current hardware or CUDA version");
#endif
  } else {
    ABORT("Type {} not yet supported", out->type());
  }
}

template <class Functor, class... Tensors>
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
  auto addFunctor = functional::_1 + functional::_2;
  Aggregate(functor, 0.f, addFunctor, scale, out, tensors...);
}
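
// Illustrative usage sketch (tensors a and b are hypothetical):
//   Add(functional::_1 * functional::_2, 1.f, out, a, b);
// accumulates the element-wise product of a and b into out, summing over any axes
// along which out is smaller than the broadcast shape of the inputs.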

#include "tensors/gpu/add.inc"
}  // namespace gpu
}  // namespace marian