.. _program_listing_file_src_tensors_gpu_add_all.h:

Program Listing for File add_all.h
==================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_tensors_gpu_add_all.h>` (``src/tensors/gpu/add_all.h``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   #pragma once

   // This header file provides wrappers around NVidia's reduce_all kernel with our custom aggregation functionality
   // This kernel reduces a tensor into a single value. We have modified it to allow for different types of aggregations
   // like summing or max etc.

   #include "tensors/gpu/cuda_helpers.h"
   #include "tensors/tensor.h"
   #include "tensors/allocator.h"
   #include "functional/tensor.h"
   #include "tensors/tensor_operators.h"

   namespace marian {

   // These function declarations are repeated as template specialization with variadic template arguments does not seem to work.
   // Here I am just creating version for 1, 2, and 3 arguments. To be extended if required.
   template <typename T, typename AccType, class Functor, class AggFunctor>
   void AggregateAll(Ptr<Allocator> allocator,
                     Functor functor,
                     AccType aggInit,
                     AggFunctor aggFunctor,
                     AccType scale,
                     Tensor out,
                     const Tensor in1);

   template <typename T, typename AccType, class Functor, class AggFunctor>
   void AggregateAll(Ptr<Allocator> allocator,
                     Functor functor,
                     AccType aggInit,
                     AggFunctor aggFunctor,
                     AccType scale,
                     Tensor out,
                     const Tensor in1,
                     const Tensor in2);

   template <typename T, typename AccType, class Functor, class AggFunctor>
   void AggregateAll(Ptr<Allocator> allocator,
                     Functor functor,
                     AccType aggInit,
                     AggFunctor aggFunctor,
                     AccType scale,
                     Tensor out,
                     const Tensor in1,
                     const Tensor in2,
                     const Tensor in3);

   // Aggregates all values into a single tensor and returns the value of that tensor as a float
   // This does a GPU to CPU memory copy via TensorBase::scalar().
   // Used currently only for L2Norm computation
   template <typename T, typename AccType, class Functor, class AggFunctor, class... Tensors>
   AccType AggregateAllAndReturn(Ptr<Allocator> allocator,
                                 Functor functor,
                                 AccType aggInit,
                                 AggFunctor aggFunctor,
                                 AccType scale,
                                 const Tensors... tensors) {
     MemoryPiece::PtrType temporaryMemory;
     if(allocator) {
       temporaryMemory = allocator->alloc<AccType>(1);
     } else { // @TODO: get rid of this branch
       uint8_t* temporaryMemoryPtr = 0;
       CUDA_CHECK(cudaMalloc(&temporaryMemoryPtr, sizeof(AccType)));
       temporaryMemory = MemoryPiece::New(temporaryMemoryPtr, sizeof(AccType));
     }

     std::tuple<Tensors...> in(tensors...);

     // Create a temporary tensor of size 1 to reduce into
     auto out = TensorBase::New(temporaryMemory,
                                Shape({1}),
                                typeId<AccType>(),
                                std::get<0>(in)->getBackend());
     out->set(aggInit); // init to aggInit

     AggregateAll<T, AccType>(allocator, functor, aggInit, aggFunctor, scale, out, tensors...);

     AccType outScalar = out->template scalar<AccType>(); // convert to float also if other underlying type

     if(allocator)
       allocator->free(out->memory());
     else if(out->memory()->data()) // @TODO: get rid of this branch
       CUDA_CHECK(cudaFree(out->memory()->data()));

     return outScalar;
   }

   }