Program Listing for File add_all.cu

Return to documentation for file (src/tensors/gpu/add_all.cu)

#include "tensors/gpu/add_all.h"
#include "tensors/gpu/cuda_helpers.h"
#include "functional/functional.h"
#include "tensors/tensor_operators.h"
#include "3rd_party/reduce_all.h" // only works with CUDA >9.0, we are dropping CUDA 8.0 support, also changed in CMakeLists.txt

namespace marian {

#if COMPILE_FP16
// local overload to determine tensor type
template <> inline Type typeId<half>()  { return Type::float16; }
#endif

// Version with variadic template arguments, called by the versions with explicit arguments below
template <typename T, typename AccType, class Functor, class AggFunctor, class... Tensors>
void AggregateAllVar(Ptr<Allocator> allocator,
                     Functor functor,
                     AccType aggInit,
                     AggFunctor aggFunctor,
                     AccType scale,
                     marian::Tensor out,
                     const Tensors... tensors) {
  cudaSetDevice(out->getDeviceId().no);

  static_assert(CUDA_VERSION >= 9000, "Marian requires CUDA_VERSION >= 9000 (9.0)");

  constexpr size_t K = sizeof...(Tensors);                         // obtain arity K of tensors...
  functional::Array<functional::Tensor<T>, K> gIns = {tensors...}; // convert to array of K objects of type functional::Tensor<T>
  functional::Shape full = marian::Shape::broadcast({tensors...}); // compute maximal broadcasted shape

  int size = full.elements();
  int threads = (size < MAX_THREADS * 2) ? nextPow2((size + 1) / 2) : MAX_THREADS; // suggested in the NVIDIA example for the all_reduce kernel
  int blocks  = std::min(MAX_BLOCKS, (size + (threads * 2 - 1)) / (threads * 2));  // suggested in the NVIDIA example for the all_reduce kernel
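  // Worked example of the heuristic above (illustration only; assumes MAX_THREADS == 512):
  //   size == 3000 -> threads == 512, blocks == (3000 + 1023) / 1024 == 3  (second pass needed below)
  //   size == 100  -> threads == nextPow2(50) == 64, blocks == 1           (single pass suffices)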

  // The all_reduce kernel by NVIDIA needs to perform multiple passes if the number of blocks needed to perform the reduction is larger than 1.
  // Here we allocate the memory for the intermediate reductions, one element per block.
  marian::Tensor blockMem;
  if(blocks > 1 || out->type() != typeId<AccType>()) { // if the out tensor does not have elementType AccType we need to allocate and convert later
    MemoryPiece::PtrType temporaryMemory;
    if(allocator) {
      temporaryMemory = allocator->alloc<AccType>(blocks);
    } else { // @TODO: get rid of this branch
      uint8_t* temporaryMemoryPtr = nullptr;
      CUDA_CHECK(cudaMalloc(&temporaryMemoryPtr, sizeof(AccType) * blocks));
      temporaryMemory = MemoryPiece::New(temporaryMemoryPtr, sizeof(AccType) * blocks); // @TODO: consider implementing MemoryPiece::cudaMalloc<T>(size) for managed memory
    }
    blockMem = TensorBase::New(temporaryMemory,
                               marian::Shape({blocks}),
                               typeId<AccType>(),
                               out->getBackend());
    blockMem->set(aggInit); // set temporary memory to aggInit
  }
  else {            // we are reducing into a single element now and the type matches, just use out as memory
    blockMem = out; // do not set final output memory as we might be summing gradients... needs to be handled outside this function
  }
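  // Illustration of the conversion case (not part of the original source): when accumulating
  // float16 values in float32 (T == half, AccType == float) into a float16 `out` tensor,
  // blockMem is a small float32 buffer even for blocks == 1, and the final value is written
  // into `out` via CopyCast below.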

  functional::Tensor<AccType> gBlockMem = blockMem;
  reduceSinglePass<T, AccType>(functor, aggInit, aggFunctor, scale, full, /*out=*/gBlockMem, /*in=*/gIns, threads, blocks);  // First pass reduction into intermediate memory

  // If we actually needed more than one block to perform the first pass reduction, recursively run a second pass reduction over block memory until block memory has size 1.
  if(blocks > 1) {
    using namespace functional;
    auto identity = _1; // transformation was done in first pass, hence only identity
    AggregateAll<AccType, AccType>(allocator, identity, aggInit, aggFunctor, scale, out, /*in=*/blockMem); // Reducing AccType in AccType now (meta-reduction)
  } else if(out->type() != typeId<AccType>()) { // only a single block, but we still need to convert to a different type, as mentioned above
    CopyCast(out, blockMem);
  }
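  // Illustration of the recursion above (not part of the original source): with, say, 3 first-pass
  // blocks, blockMem holds 3 partial aggregates of type AccType; the recursive AggregateAll call
  // combines them, now fitting into a single block, directly into `out`. The recursion bottoms out
  // as soon as the remaining partial results can be reduced by one block.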

  if(blockMem != out) {
    // Free temporary memory whether allocated in allocator or via cudaMalloc
    if(allocator)
      allocator->free(blockMem->memory());
    else if(blockMem->memory()->data())
      CUDA_CHECK(cudaFree(blockMem->memory()->data()));
  }
}

template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
                  Functor functor,
                  AccType aggInit,
                  AggFunctor aggFunctor,
                  AccType scale,
                  marian::Tensor out,
                  const marian::Tensor in1) {
  AggregateAllVar<T, AccType>(allocator, functor, aggInit, aggFunctor, scale, out, in1);
}

template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
                  Functor functor,
                  AccType aggInit,
                  AggFunctor aggFunctor,
                  AccType scale,
                  marian::Tensor out,
                  const marian::Tensor in1,
                  const marian::Tensor in2) {
  AggregateAllVar<T, AccType>(allocator, functor, aggInit, aggFunctor, scale, out, in1, in2);
}

template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
                  Functor functor,
                  AccType aggInit,
                  AggFunctor aggFunctor,
                  AccType scale,
                  marian::Tensor out,
                  const marian::Tensor in1,
                  const marian::Tensor in2,
                  const marian::Tensor in3) {
  AggregateAllVar<T, AccType>(allocator, functor, aggInit, aggFunctor, scale, out, in1, in2, in3);
}
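
// Hypothetical usage sketch (not part of this file): summing all elements of a float32 tensor
// `in` into the single-element tensor `out`, accumulating in float. The identifiers `allocator`,
// `in` and `out` are assumed to exist in the caller; the placeholder expressions _1 and _1 + _2
// come from functional/functional.h.
//
//   using namespace functional;
//   AggregateAll<float, float>(allocator,
//                              /*functor=*/_1,         // per-element transform: identity
//                              /*aggInit=*/0.0f,       // neutral element of the sum
//                              /*aggFunctor=*/_1 + _2, // pairwise aggregation: addition
//                              /*scale=*/1.0f,
//                              out,
//                              /*in1=*/in);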

#include "tensors/gpu/add_all.inc"

}  // namespace marian