Program Listing for File add_all.h

Return to documentation for file (src/tensors/gpu/add_all.h)

#pragma once

// This header file provides wrappers around NVIDIA's reduce_all kernel with our custom aggregation functionality.
// The kernel reduces a tensor to a single value. We have modified it to allow for different kinds of aggregation,
// e.g. summation or taking the maximum.

#include "tensors/gpu/cuda_helpers.h"
#include "tensors/tensor.h"
#include "tensors/allocator.h"
#include "functional/tensor.h"
#include "tensors/tensor_operators.h"

namespace marian {

// These function declarations are repeated because template specialization with variadic template arguments does not seem to work.
// Here I am just creating versions for 1, 2, and 3 arguments, to be extended if required.
// A commented usage sketch follows the three declarations below.
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
                  Functor functor,
                  AccType aggInit,
                  AggFunctor aggFunctor,
                  AccType scale,
                  Tensor out,
                  const Tensor in1);

template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
                  Functor functor,
                  AccType aggInit,
                  AggFunctor aggFunctor,
                  AccType scale,
                  Tensor out,
                  const Tensor in1,
                  const Tensor in2);

template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
                  Functor functor,
                  AccType aggInit,
                  AggFunctor aggFunctor,
                  AccType scale,
                  Tensor out,
                  const Tensor in1,
                  const Tensor in2,
                  const Tensor in3);
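
// Example (a hedged sketch, not part of the original header): a full sum-reduction of a single
// input tensor using the one-argument overload above. It assumes the expression placeholders
// _1/_2 from the functional framework (functional/tensor.h) and a float accumulator; allocator,
// out (a size-1 float tensor) and in1 are stand-ins supplied by the caller.
//
//   using namespace functional;
//   AggregateAll<float, float>(allocator,
//                              _1,        // per-element functor: identity
//                              0.f,       // aggInit: neutral element of addition
//                              _1 + _2,   // aggFunctor: pairwise sum
//                              1.f,       // scale applied to the final result
//                              out,       // size-1 tensor that receives the sum
//                              in1);      // input tensor to reduce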

// Aggregates all values into a single tensor and returns the value of that tensor as AccType (typically float).
// This does a GPU-to-CPU memory copy via TensorBase::scalar().
// Currently used only for the L2Norm computation; a commented usage sketch follows the definition below.
template <typename T, typename AccType, class Functor, class AggFunctor, class... Tensors>
AccType AggregateAllAndReturn(Ptr<Allocator> allocator,
                              Functor functor,
                              AccType aggInit,
                              AggFunctor aggFunctor,
                              AccType scale,
                              const Tensors... tensors) {
  MemoryPiece::PtrType temporaryMemory;
  if(allocator) {
    temporaryMemory = allocator->alloc<AccType>(1);
  } else { // @TODO: get rid of this branch
    uint8_t* temporaryMemoryPtr = nullptr;
    CUDA_CHECK(cudaMalloc(&temporaryMemoryPtr, sizeof(AccType)));
    temporaryMemory = MemoryPiece::New(temporaryMemoryPtr, sizeof(AccType));
  }

  std::tuple<Tensors...> in(tensors...);

  // Create a temporary tensor of size 1 to reduce into
  auto out = TensorBase::New(temporaryMemory,
                             Shape({1}),
                             typeId<AccType>(),
                             std::get<0>(in)->getBackend());
  out->set(aggInit); // init to aggInit

  AggregateAll<T, AccType>(allocator, functor, aggInit, aggFunctor, scale, out, tensors...);

  AccType outScalar = out->template scalar<AccType>(); // also converts to AccType (e.g. float) if the tensor's underlying type differs

  if(allocator)
    allocator->free(out->memory());
  else if(out->memory()->data()) // @TODO: get rid of this branch
    CUDA_CHECK(cudaFree(out->memory()->data()));

  return outScalar;
}
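
// Example (a hedged sketch, not part of the original header): an L2-norm computation via
// AggregateAllAndReturn, as mentioned in the comment above. _1/_2 are the expression
// placeholders from the functional framework; allocator and in are stand-ins supplied by
// the caller, and the square root is taken on the CPU after the GPU reduction (std::sqrt
// from <cmath>).
//
//   using namespace functional;
//   float l2 = std::sqrt(
//       AggregateAllAndReturn<float, float>(allocator,
//                                           _1 * _1,   // functor: square each element
//                                           0.f,       // aggInit: neutral element of addition
//                                           _1 + _2,   // aggFunctor: pairwise sum
//                                           1.f,       // scale
//                                           in));      // input tensor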

}  // namespace marian