.. _program_listing_file_src_tensors_gpu_add.cu:

Program Listing for File add.cu
===============================

|exhale_lsh| :ref:`Return to documentation for file <file_src_tensors_gpu_add.cu>` (``src/tensors/gpu/add.cu``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   #include "tensors/gpu/add.h"
   #include "tensors/gpu/add_all.h"
   #include "tensors/gpu/cuda_helpers.h"

   #include "functional/functional.h"
   #include "functional/shape.h"
   #include "functional/tensor.h"
   #include "functional/tmp.h"

   namespace marian {
   namespace gpu {

   template <size_t K, class Functor, class AggFunctor, typename T, typename AccType>
   __global__ void gAggregateGeneric(
       Functor functor,                                  // functor applied to corresponding elements of the tensors (via broadcasting)
       AccType aggInit,                                  // starting value of the accumulation (e.g. 0 for sum)
       AggFunctor aggFunctor,                            // functor used to accumulate values (e.g. sum)
       const functional::Shape full,                     // maximal combined shape of all tensors via broadcasting
       functional::Tensor<T> out,                        // output tensor
       functional::Array<functional::Tensor<T>, K> ins,  // input tensors
       AccType scale = 1.0) {                            // scale the accumulation result, e.g. by 1/N to compute a mean from a sum over N elements
     int outLength = out.shape().elements();
     bool same = outLength == full.elements();
     for(int i = 0; i < K; ++i)
       same = same && outLength == ins[i].shape().elements();

     constexpr size_t N = functional::Shape::size();
     functional::Array<int, N> len;
     for(int i = 0; i < N; ++i)
       len[i] = full[i] / out.shape()[i];

     functional::Array<int, N> dims;
     for(int bid = 0; bid < outLength; bid += blockDim.x * gridDim.x) {
       int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
       if(index < outLength) {
         if(same) {
           // apply the functor with arguments cast to AccType
           out[index] = (T)aggFunctor((AccType)out[index], functional::applyWithCast<AccType>(functor, ins, index) * scale);
         } else {
           out.shape().dims(index, dims);
           // apply the functor with arguments cast to AccType
           out[index] = (T)aggFunctor((AccType)out[index], functional::loops(functor, aggInit, aggFunctor, ins, len, dims) * scale);
         }
       }
     }
   }

   template <size_t K, class Functor, class AggFunctor, typename T, typename AccType>
   __global__ void gAggregateEqual(
       Functor functor,
       AggFunctor aggFunctor,
       functional::Tensor<T> out,
       functional::Array<functional::Tensor<T>, K> ins,
       AccType scale,
       bool broadcast) {
     int length = out.shape().elements();
     functional::Array<int, functional::Shape::size()> dims;

     for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
       int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
       if(index < length) {
         functional::Array<int, K> indices;
         indices.fill(index);

         if(broadcast) {
           out.shape().dims(index, dims);
           for(size_t i = 0; i < K; ++i)
             indices[i] = ins[i].shape().bindex(dims);
         }

         out[index] = (T)aggFunctor((AccType)out[index], functional::applyWithCast<AccType>(functor, ins, indices) * scale);
       }
     }
   }
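   // gAggregateReduce reduces the innermost (last) axis of the broadcast shape `full`
   // into a single output element per row: each CUDA block walks one row, its threads
   // accumulate strided partial results into shared memory (declared as raw bytes so
   // the AccType accumulator is independent of the element type T), and a shared-memory
   // tree reduction combines the partials before thread 0 aggregates into out[j].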
   template <size_t K, class Functor, class AggFunctor, typename T, typename AccType>
   __global__ void gAggregateReduce(
       Functor functor,
       AccType aggInit,
       AggFunctor aggFunctor,
       const functional::Shape full,
       functional::Tensor<T> out,
       functional::Array<functional::Tensor<T>, K> ins,
       AccType scale = 1.0) {
     int rows = full.elements() / full.back();
     int cols = full.back();

     bool same = true; // do all inputs have the same number of elements?
     for(int i = 0; i < K; ++i)
       same = same && ins[i].shape().elements() == full.elements();

     for(int bid = 0; bid < rows; bid += gridDim.x) {
       int j = bid + blockIdx.x;
       if(j < rows) {
         // make sure shared memory is the same for different types
         // by using bytes instead of type T
         extern __shared__ uint8_t _sharedBytes[];
         AccType* _sum = (AccType*)_sharedBytes;

         if(same) {
           _sum[threadIdx.x] = aggInit;
           for(int tid = 0; tid < cols; tid += blockDim.x) {
             int id = tid + threadIdx.x;
             if(id < cols)
               // cast to AccType before applying the functor, which then operates in AccType
               _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::applyWithCast<AccType>(functor, ins, j * cols + id));
           }
         } else {
           functional::Array<int, functional::Shape::size()> dims;
           _sum[threadIdx.x] = aggInit;

           for(int tid = 0; tid < cols; tid += blockDim.x) {
             int id = tid + threadIdx.x;
             if(id < cols) {
               full.dims(j * cols + id, dims);
               functional::Array<int, K> indices;
               for(int i = 0; i < K; ++i)
                 indices[i] = ins[i].shape().bindex(dims);
               // cast to AccType before applying the functor, which then operates in AccType
               _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::applyWithCast<AccType>(functor, ins, indices));
             }
           }
         }
         __syncthreads();
         int len = blockDim.x;
         while(len != 1) {
           __syncthreads();
           int skip = (len + 1) >> 1;
           if(threadIdx.x < (len >> 1)) {
             _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], _sum[threadIdx.x + skip]);
           }
           len = (len + 1) >> 1;
         }
         __syncthreads();
         if(threadIdx.x == 0) // only thread 0 in each block writes the result
           out[j] = aggFunctor(out[j], (T)(_sum[0] * scale));
       }
       __syncthreads();
     }
   }

   template <typename T, typename AccType, class Functor, class AggFunctor, class... Tensors>
   void AggregateTyped(Functor functor, AccType aggInit, AggFunctor aggFunctor, AccType scale, marian::Tensor out, Tensors... tensors) {
     cudaSetDevice(out->getDeviceId().no);

     auto full = marian::Shape::broadcast({out, tensors...});
     int length = out->shape().elements();

     constexpr size_t K = sizeof...(Tensors);

     functional::Tensor<T> gOut = out;
     functional::Array<functional::Tensor<T>, K> gIns = {tensors...};

     if(out->shape().elements() == 1) { // reduce everything into a single element
       AggregateAll<T, AccType>(nullptr, functor, aggInit, aggFunctor, scale, out, tensors...); // @TODO: pass allocator in here, currently uses cudaMalloc
     } else if(full.back() != 1 && out->shape().back() == 1 && full.elements() / full.back() == length) {
       // the number of elements of out and of the full shape must match on the axes that are not reduced
       size_t m = full.elements() / full.back(); // how many rows are we iterating over?
       size_t k = full.back();                   // how many columns are being reduced to 1 in each row?

       int blocks  = std::min(MAX_BLOCKS, (int)m);
       int threads = std::min(MAX_THREADS, (int)k);
       int shared  = sizeof(AccType) * threads;

       gAggregateReduce<<<blocks, threads, shared>>>(functor, aggInit, aggFunctor, full, gOut, gIns, scale);
     } else if(out->shape() == full) {
       int threads = std::min(MAX_THREADS, length);
       int blocks  = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

       bool broadcast = false;
       for(int i = 0; i < K; ++i)
         broadcast = broadcast || gOut.shape() != gIns[i].shape();
       gAggregateEqual<<<blocks, threads>>>(functor, aggFunctor, gOut, gIns, scale, broadcast);
     } else {
       int threads = std::min(MAX_THREADS, length);
       int blocks  = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

       gAggregateGeneric<<<blocks, threads>>>(functor, aggInit, aggFunctor, full, gOut, gIns, scale);
     }
   }
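   // Aggregate dispatches on the element type of `out`: accumulation always runs in
   // float (the AccType), while values are stored as float32 or, when compiled with
   // COMPILE_FP16, as half-precision float16.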
   template <class Functor, class AggFunctor, class... Tensors>
   void Aggregate(Functor functor, float aggInit, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors) {
     if(out->type() == Type::float32) {
       AggregateTyped<float, float>(functor, aggInit, aggFunctor, scale, out, tensors...);
     } else if(out->type() == Type::float16) {
   #if COMPILE_FP16
       AggregateTyped<half, float>(functor, aggInit, aggFunctor, scale, out, tensors...);
   #else
       ABORT("FP16 not supported with current hardware or CUDA version");
   #endif
     } else {
       ABORT("Type {} not yet supported", out->type());
     }
   }

   template <class Functor, class... Tensors>
   void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
     auto addFunctor = functional::_1 + functional::_2;
     Aggregate(functor, 0.f, addFunctor, scale, out, tensors...);
   }

   #include "tensors/gpu/add.inc"

   } // namespace gpu
   } // namespace marian
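
For orientation, a minimal usage sketch (not part of the file above; the tensor
handles ``out``, ``a`` and ``b`` are hypothetical and assumed to be allocated
``marian::Tensor`` objects on the same GPU device):

.. code-block:: cpp

   using namespace marian;

   // Accumulate the elementwise product of a and b into out with broadcasting,
   // i.e. out += 1.f * (a * b); functional::_1 and functional::_2 are marian's
   // placeholder functors from functional/functional.h. Depending on the shapes
   // involved, this dispatches to gAggregateEqual, gAggregateReduce or
   // gAggregateGeneric (or to AggregateAll for a full reduction to one element).
   gpu::Add(functional::_1 * functional::_2, 1.f, out, a, b);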