.. _program_listing_file_src_tensors_gpu_tensor_operators.cu:

Program Listing for File tensor_operators.cu
============================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_tensors_gpu_tensor_operators.cu>` (``src/tensors/gpu/tensor_operators.cu``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   #include "common/types.h"
   #include "tensors/tensor_operators.h"

   #include "functional/functional.h"
   #include "functional/tensor.h"
   #include "tensors/allocator.h"
   #include "tensors/gpu/backend.h"
   #include "tensors/gpu/cuda_helpers.h"

   #include "tensors/gpu/add_all.h"

   namespace marian {
   namespace gpu {

   namespace atomics {

   static inline __device__ void atomicAdd(float *address, float val) {
     ::atomicAdd(address, val);
   }

   #if COMPILE_FP16
   // @TODO: copied from CuTorch, adapt this better, give credit.
   static inline __device__ void atomicAdd(half *address, half val) {
   #if __CUDA_ARCH__ >= 700 && CUDA_VERSION >= 10000 // compute capability 70 and higher with CUDA 10
     ::atomicAdd(address, val);
   #else // __CUDA_ARCH__ < 700
     unsigned int * address_as_ui =
         (unsigned int *) ((char *)address - ((size_t)address & 2));
     unsigned int old = *address_as_ui;
     unsigned int assumed;

     do {
       assumed = old;
   #if CUDA_VERSION < 9000
       half hsum;
       hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
       hsum = hsum + val;
   #else
       __half_raw hsum;
       hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
       half tmpres = hsum + val;
       hsum = __half_raw(tmpres);
   #endif
       old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16)
                                 : (old & 0xffff0000) | hsum.x;
       old = atomicCAS(address_as_ui, assumed, old);
     } while (assumed != old);
   #endif // __CUDA_ARCH__
   }
   #endif // COMPILE_FP16
   }

   template <typename T>
   __global__ void gIsNaN(const T* in, int length, bool* isNaN, bool* isInf) {
     for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
       int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
       if(index < length) {
         if(isnan((float)in[index])) *isNaN = true;
         if(isinf((float)in[index])) *isInf = true;
       }
     }
   }

   void IsNaN(const Tensor in, Ptr<Allocator> allocator, bool& isNaN, bool& isInf) {
     cudaSetDevice(in->getDeviceId().no);

     int length = in->size();

     int threads = std::min(MAX_THREADS, length);
     int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

     auto mem = allocator->alloc<bool>(2);
     bool* dIsNaN = &mem->data<bool>()[0];
     bool* dIsInf = &mem->data<bool>()[1];
     fill(in->getBackend(), dIsNaN, dIsNaN + 2, false);

     if(in->type() == Type::float32) {
       gIsNaN<<<blocks, threads>>>(in->data<float>(), length, dIsNaN, dIsInf);
   #if COMPILE_FP16
     } else if(in->type() == Type::float16) {
       gIsNaN<<<blocks, threads>>>(in->data<half>(), length, dIsNaN, dIsInf);
   #endif
     } else {
       ABORT("IsNaN for type {} not implemented", in->type());
     }

     CudaCopy(dIsNaN, dIsNaN + 1, &isNaN);
     CudaCopy(dIsInf, dIsInf + 1, &isInf);

     allocator->free(mem);

     cudaStreamSynchronize(0);
   }

   template <typename T>
   __global__ void gSanitizeGradient(T* in, int length,
                                     bool* isNaN, bool* isInf,
                                     bool pruneNaN, bool clipInf,
                                     float forNaN = 0.f, float forInf = 65504.f, float forInfNeg = -65504.f) {
     for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
       int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
       if(index < length) {
         float v = (float)in[index];
         // handle NaN
         if(isnan(v)) {
           if(pruneNaN) {
             in[index] = (T)forNaN;
           } else {
             *isNaN = true;
           }
         }
         // handle +/- Inf
         if(isinf(v)) {
           if(clipInf) {
             in[index] = v > 0 ? (T)forInf : (T)forInfNeg;
           } else {
             *isInf = true;
           }
         }
       }
     }
   }

   // This function is meant to clean gradients, i.e. clip infinities and prune NaNs if required.
   // If all NaNs and Infs have been removed we return `true` for indicating a sane gradient.
   // If `clipInf` is set, infinities are replaced with the maximum/minimum non-inf value for the tensor.
   // In that case infinities do not result in a bad gradient, since they get clipped.
   // If `pruneNaN` is set, NaNs are replaced with 0. Since NaNs get removed now they do not result
   // in a bad gradient.
   // If NaNs or infinities are detected but not removed (either because of `pruneNaN=false` or `clipInf=false`),
   // we return `false` indicating a bad gradient.
   bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf) {
     cudaSetDevice(in->getDeviceId().no);

     int length = in->size();

     int threads = std::min(MAX_THREADS, length);
     int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));

     auto mem = allocator->alloc<bool>(2);
     bool* dIsNaN = &mem->data<bool>()[0];
     bool* dIsInf = &mem->data<bool>()[1];
     fill(in->getBackend(), dIsNaN, dIsNaN + 2, false);

     float forNaN    = 0.f;
     float forInf    = NumericLimits<float>(in->type()).max;
     float forInfNeg = NumericLimits<float>(in->type()).lowest;

     if(in->type() == Type::float32) {
       gSanitizeGradient<<<blocks, threads>>>(in->data<float>(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg);
   #if COMPILE_FP16
     } else if(in->type() == Type::float16) {
       gSanitizeGradient<<<blocks, threads>>>(in->data<half>(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg);
   #endif
     } else {
       ABORT("gSanitizeGradient for type {} not implemented", in->type());
     }

     bool isNaN, isInf;
     CudaCopy(dIsNaN, dIsNaN + 1, &isNaN);
     CudaCopy(dIsInf, dIsInf + 1, &isInf);

     allocator->free(mem);

     cudaStreamSynchronize(0);

     return !isNaN && !isInf;
   }

   template <bool add, typename To, typename From>
   __global__ void gCopyCastTo(To* out, const From* in, int length) {
     for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
       int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
       if(index < length) {
         if(add)
           out[index] += (To)in[index];
         else
           out[index] = (To)in[index];
       }
     }
   }

   template <bool add, typename To, typename From>
   void CopyCastTo(To* out, const From* in, int length) {
     int threads = std::min(MAX_THREADS, length);
     int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
     gCopyCastTo<add><<<blocks, threads>>>(out, in, length);
   }

   template <bool add, typename T>
   void CopyCastFrom(Tensor out, const T* in, int length) {
     if(out->type() == Type::float32) {
       CopyCastTo<add>(out->data<float>(), in, length);
   #if COMPILE_FP16
     } else if(out->type() == Type::float16) {
       CopyCastTo<add>(out->data<half>(), in, length);
   #endif
     } else if(out->type() == Type::float64) {
       CopyCastTo<add>(out->data<double>(), in, length);
     } else {
       ABORT("CopyCastTo to type {} not implemented", out->type());
     }
   }

   void CopyCast(Tensor out, const Tensor in) {
     cudaSetDevice(out->getDeviceId().no);

     if(in->type() == Type::float32) {
       CopyCastFrom<false>(out, in->data<float>(), (int)in->size());
   #if COMPILE_FP16
     } else if(in->type() == Type::float16) {
       CopyCastFrom<false>(out, in->data<half>(), (int)in->size());
   #endif
     } else if(in->type() == Type::float64) {
       CopyCastFrom<false>(out, in->data<double>(), (int)in->size());
     } else if(in->type() == Type::uint32) {
       CopyCastFrom<false>(out, in->data<uint32_t>(), (int)in->size());
     } else {
       ABORT("CopyCastFrom from type {} not implemented", in->type());
     }
   }

   void AddCast(Tensor out, const Tensor in) {
     cudaSetDevice(out->getDeviceId().no);

     if(in->type() == Type::float32) {
       CopyCastFrom<true>(out, in->data<float>(), (int)in->size());
   #if COMPILE_FP16
     } else if(in->type() == Type::float16) {
       CopyCastFrom<true>(out, in->data<half>(), (int)in->size());
   #endif
     } else if(in->type() == Type::float64) {
       CopyCastFrom<true>(out, in->data<double>(), (int)in->size());
     } else if(in->type() == Type::uint32) {
       CopyCastFrom<true>(out, in->data<uint32_t>(), (int)in->size());
     }
else { ABORT("CopyCastFrom from type {} not implemented", in->type()); } } void ConcatCont(Tensor out, const std::vector& inputs, int axis) { cudaSetDevice(out->getDeviceId().no); int step = 1; for(int i = 0; i < axis; ++i) step *= out->shape()[i]; size_t offset1 = 0; for(int i = 0; i < step; ++i) { for(auto in : inputs) { size_t size = (in->shape().elements() / step) * sizeOf(out->type()); size_t offset2 = i * size; cudaMemcpy(out->data() + offset1, in->data() + offset2, size, cudaMemcpyDeviceToDevice); offset1 += size; } } cudaStreamSynchronize(0); } template __global__ void gInsertCols(T* out, const T* in, size_t rows, size_t cols, size_t cols_out, size_t cols_in, size_t offset_out, size_t offset_in) { for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { // @TODO: change to if j == rows then break, as that's what it means. In 4 functions in here. T* rowOut = out + j * cols_out + offset_out; const T* rowIn = in + j * cols_in + offset_in; for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) if(add) rowOut[i] += rowIn[i]; else rowOut[i] = rowIn[i]; } } } } // Special version for axis = -1 @TODO: write common better version void Concatenate1(Tensor out, const std::vector& inputs) { cudaSetDevice(out->getDeviceId().no); int rows = out->shape().elements() / out->shape().back(); size_t offset = 0; int cols_out = out->shape().back(); for(auto in : inputs) { ABORT_IF(rows != in->shape().elements() / in->shape().back(), "First dimension must be equal"); int cols_in = in->shape().back(); int blocks = std::min(MAX_BLOCKS, rows); int threads = std::min(MAX_THREADS, cols_in); if(out->type() == Type::float32) { gInsertCols<<>>(out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0); #if COMPILE_FP16 } else if(out->type() == Type::float16) { gInsertCols<<>>(out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0); #endif } else { ABORT("Concatenate1 not implemented for type {}", out->type()); } offset += cols_in; } cudaStreamSynchronize(0); } template __global__ void gJoin2(T* out, size_t rowBatch, size_t cols, const T* in1, size_t inStride1, const T* in2, size_t inStride2) { int outStride = inStride1 + inStride2; int rows = rowBatch * outStride; for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { T* rowOut = out + j * cols; int curBatch = j / outStride; int curPos = j % outStride; int jIn1 = (curBatch * inStride1) + curPos; int jIn2 = (curBatch * inStride2) + curPos - inStride1; const T* rowIn1 = in1 + jIn1 * cols; const T* rowIn2 = in2 + jIn2 * cols; for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { if(curPos < inStride1) rowOut[i] = rowIn1[i]; else rowOut[i] = rowIn2[i]; } } } } } // Special version for axis = -2 @TODO: write common better version void Concatenate2(Tensor out, Tensor in1, Tensor in2) { cudaSetDevice(out->getDeviceId().no); size_t rows = out->shape().elements() / out->shape().back(); size_t cols = out->shape().back(); size_t rowStride1 = in1->shape()[-2]; size_t rowStride2 = in2->shape()[-2]; size_t rowBatch = rows / out->shape()[-2]; int blocks = std::min(MAX_BLOCKS, (int)rows); int threads = std::min(MAX_THREADS, (int)cols); if(out->type() == Type::float32) { gJoin2<<>>(out->data(), rowBatch, cols, in1->data(), rowStride1, in2->data(), rowStride2); #if COMPILE_FP16 } else if(out->type() == Type::float16) { gJoin2<<>>(out->data(), rowBatch, cols, in1->data(), rowStride1, in2->data(), rowStride2); #endif } 
else { ABORT("Concatenate2 not implemented for type {}", out->type()); } cudaStreamSynchronize(0); } void Concatenate(Tensor out, const std::vector& inputs, int ax) { if(ax == out->shape().size() - 1) Concatenate1(out, inputs); else if(ax == out->shape().size() - 2 && inputs.size() == 2) Concatenate2(out, inputs[0], inputs[1]); else ConcatCont(out, inputs, ax); } void Split1(std::vector& outputs, const Tensor in) { cudaSetDevice(in->getDeviceId().no); size_t offset = 0; int rows = in->shape().elements() / in->shape().back(); int cols_in = in->shape().back(); for(auto out : outputs) { ABORT_IF(rows != out->shape().elements() / out->shape().back(), "First dimension must be equal"); int cols_out = out->shape().back(); int blocks = std::min(MAX_BLOCKS, rows); int threads = std::min(MAX_THREADS, cols_out); if(out->type() == Type::float32) { gInsertCols<<>>( out->data(), in->data(), rows, cols_out, cols_out, cols_in, 0, offset); #if COMPILE_FP16 } else if(out->type() == Type::float16) { gInsertCols<<>>( out->data(), in->data(), rows, cols_out, cols_out, cols_in, 0, offset); #endif } else { ABORT("Split1 not implemented for type {}", out->type()); } offset += cols_out; } cudaStreamSynchronize(0); } // @TODO: this function is just a temporary fix until I come up with // something better for the situation below. template __global__ void gAddRow(T* out, const T* in, int length) { for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < length) { out[index] = in[index] + out[index]; } } } void SplitCont(std::vector& outputs, const Tensor in, int axis) { cudaSetDevice(in->getDeviceId().no); int step = 1; for(int i = 0; i < axis; ++i) step *= in->shape()[i]; int offset1 = 0; for(int i = 0; i < step; ++i) { for(auto out : outputs) { int size = out->shape().elements() / step; int offset2 = i * size; // BUG: this is does not add gradients // cudaMemcpyAsync(out->data() + offset2, // in->data() + offset1, // size * sizeof(float), // cudaMemcpyDeviceToDevice); // @TODO: this is a quick but bad fix for the above bug int threads = std::min(MAX_THREADS, size); int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0)); if(out->type() == Type::float32) { gAddRow<<>>(out->data() + offset2, in->data() + offset1, size); #if COMPILE_FP16 } else if(out->type() == Type::float16) { gAddRow<<>>(out->data() + offset2, in->data() + offset1, size); #endif } else { ABORT("SplitCont not implemented for type {}", out->type()); } offset1 += size; } } cudaStreamSynchronize(0); } void Deconcatenate(std::vector& outputs, const Tensor in, int ax) { if(ax == in->shape().size() - 1) Split1(outputs, in); else SplitCont(outputs, in, ax); } template __global__ void gTransposeND( functional::Tensor out, const functional::Tensor in, const functional::Array permute) { constexpr size_t N = functional::Shape::size(); functional::Array oDims; functional::Array pDims; int length = out.shape().elements(); for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < length) { out.shape().dims(index, oDims); for(int i = 0; i < N; ++i) pDims[permute[i]] = oDims[i]; int inIndex = in.shape().index(pDims); // TODO: operates on raw indices, change to // converting Tensor::operator[] if(add) out.data()[index] += in.data()[inIndex]; else out.data()[index] = in.data()[inIndex]; } } } template __global__ void gTranspose0213(T* out, const T* in, int rows, int cols, int stride1, int stride2) { int 
stride = stride1 * stride2; for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { T* rowOut = out + j * cols; int z = j / stride; int y = (j % stride) / stride1; int x = (j % stride) % stride1; int j2 = z * stride + x * stride2 + y; const T* rowIn = in + j2 * cols; for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { if(add) rowOut[i] += rowIn[i]; else rowOut[i] = rowIn[i]; } } } } } void TransposeND(Tensor out, Tensor in, const std::vector& vAxis) { cudaSetDevice(out->getDeviceId().no); if(vAxis == std::vector({0, 2, 1, 3})) { int rows = out->shape().elements() / out->shape().back(); int cols = out->shape().back(); int blocks = std::min(MAX_BLOCKS, rows); int threads = std::min(MAX_THREADS, cols); int stride1 = out->shape()[-2]; int stride2 = out->shape()[-3]; if(in->type() == Type::float32) { gTranspose0213<<>>(out->data(), in->data(), rows, cols, stride1, stride2); } else if(in->type() == Type::uint32) { gTranspose0213<<>>(out->data(), in->data(), rows, cols, stride1, stride2); #if COMPILE_FP16 } else if(in->type() == Type::float16) { gTranspose0213<<>>(out->data(), in->data(), rows, cols, stride1, stride2); #endif } else { ABORT("Transpose for type {} not implemented", in->type()); } } else { functional::Array axes; int diff = functional::Shape::size() - vAxis.size(); for(int i = 0; i < axes.size(); ++i) if(i < diff) axes[i] = i; else axes[i] = vAxis[i - diff] + diff; int length = out->shape().elements(); int threads = std::min(MAX_THREADS, length); int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); if(in->type() == Type::float32) { gTransposeND<<>>(out, in, axes); } else if(in->type() == Type::uint32) { gTransposeND<<>>(out, in, axes); #if COMPILE_FP16 } else if(in->type() == Type::float16) { gTransposeND<<>>(out, in, axes); #endif } else { ABORT("Transpose for type {} not implemented", in->type()); } } } //@TODO: code duplication? 
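   // Illustrative sketch (an assumption added for exposition, not part of the
   // original tensor_operators.cu): a host-side reference of the per-element
   // index mapping that gTransposeND computes. The linear output index is
   // decomposed into per-axis coordinates, the coordinates are scattered
   // through the permutation (output axis i corresponds to input axis
   // permute[i]), and the permuted coordinates are re-linearized against the
   // input shape. Shapes are assumed dense row-major; the hypothetical helper
   // name transposeSourceIndexRef does not exist in Marian.
   static inline int transposeSourceIndexRef(int outIndex,
                                             const int* outDims, // output shape (row-major)
                                             const int* inDims,  // input shape (row-major)
                                             const int* permute, // output axis i <- input axis permute[i]
                                             int numAxes) {      // numAxes <= functional::Shape::size()
     constexpr size_t maxDims = functional::Shape::size();
     int oDims[maxDims]; // output coordinates
     int pDims[maxDims]; // permuted coordinates in input order
     // decompose the linear output index, last axis fastest
     for(int i = numAxes - 1; i >= 0; --i) {
       oDims[i] = outIndex % outDims[i];
       outIndex /= outDims[i];
     }
     // scatter output coordinates to input coordinates via the permutation
     for(int i = 0; i < numAxes; ++i)
       pDims[permute[i]] = oDims[i];
     // re-linearize against the input shape
     int inIndex = 0;
     for(int i = 0; i < numAxes; ++i)
       inIndex = inIndex * inDims[i] + pDims[i];
     return inIndex;
   }
   // For permute = {0, 2, 1, 3} this reproduces the element mapping that the
   // specialized gTranspose0213 kernel above computes with row arithmetic.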
void TransposeNDGrad(Tensor out, Tensor in, const std::vector& vAxis) { cudaSetDevice(out->getDeviceId().no); if(vAxis == std::vector({0, 2, 1, 3})) { int rows = out->shape().elements() / out->shape().back(); int cols = out->shape().back(); int blocks = std::min(MAX_BLOCKS, rows); int threads = std::min(MAX_THREADS, cols); int stride1 = out->shape()[-2]; int stride2 = out->shape()[-3]; if(in->type() == Type::float32) { gTranspose0213<<>>(out->data(), in->data(), rows, cols, stride1, stride2); #if COMPILE_FP16 } else if(in->type() == Type::float16) { gTranspose0213<<>>(out->data(), in->data(), rows, cols, stride1, stride2); #endif } else { ABORT("Transpose for type {} not implemented", in->type()); } } else { functional::Array axes; int diff = functional::Shape::size() - vAxis.size(); for(int i = 0; i < axes.size(); ++i) if(i < diff) axes[i] = i; else axes[i] = vAxis[i - diff] + diff; int length = out->shape().elements(); int threads = std::min(MAX_THREADS, length); int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); if(in->type() == Type::float32) { gTransposeND<<>>(out, in, axes); #if COMPILE_FP16 } else if(in->type() == Type::float16) { gTransposeND<<>>(out, in, axes); #endif } else { ABORT("Transpose for type {} not implemented", in->type()); } } } // Computes the softmax // in - input tensor // out - output tensor // we compute the softmax over the the cols (last dimension) // rows are time, batch or beam dimensions // number of threads is number of cols or MAX_THREADS // number of blocks is number of rows or MAX_BLOCKS // @TODO: handle half2 template __global__ void gSoftmax(T* out, functional::Shape outShape, const T* in) { using namespace functional; int rows = outShape.elements() / outShape.back(); int cols = outShape.back(); for(int bid = 0; bid < rows; bid += gridDim.x) { // loop over blocks of rows int j = bid + blockIdx.x; // blockIdx.x - row index (within block of rows) if(j < rows) { // compute softmax over one row, row elements distributed over threads T* so = out + j * cols; // pointer to row input data const T* sp = in + j * cols; // CUDA complains if type or size of shared memory changes, keep size constant. extern __shared__ uint8_t _sharedBytes[]; T* _share = (T*)_sharedBytes; AccType* _shareAccType = (AccType*)_sharedBytes; // determine max (used below to improve numeric stability) T* _max = _share; // @TODO: what's going on here with fp16? _max[threadIdx.x] = -CUDA_FLT_MAX; // mask // find max over column indices that have the same relative column index (=threadIdx.x) across all blocks of columns for(int tid = 0; tid < cols; tid += blockDim.x) { // threadIdx.x = column index within block of columns; we reduce over columns within a block, then over blocks int i = tid + threadIdx.x; if(i < cols) { if(sp[i] > _max[threadIdx.x]) _max[threadIdx.x] = sp[i]; } } __syncthreads(); // max over columns within a column block via tree reduction int len = blockDim.x; while(len != 1) { __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) { if(_max[threadIdx.x + skip] > _max[threadIdx.x]) { _max[threadIdx.x] = _max[threadIdx.x + skip]; } } len = (len + 1) >> 1; } __syncthreads(); T max = _max[0]; __syncthreads(); // compute denominator AccType* _sum = _shareAccType; // accumulate into AccType _sum[threadIdx.x] = 0.0; for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { // @TODO: is it faster to cache the result of expf() in GPU RAM, or would it be faster to recompute it below? 
T ex = Ops::exp(sp[i] - max); so[i] = (T)ex; _sum[threadIdx.x] += (AccType)ex; // accumulate into AccType } } __syncthreads(); // now reduce over all columns within the block len = blockDim.x; while(len != 1) { __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) _sum[threadIdx.x] += _sum[threadIdx.x + skip]; len = (len + 1) >> 1; } __syncthreads(); // produce final output data AccType sum = _sum[0]; for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { so[i] = (T)((AccType)so[i] / sum); // divide as AccType then convert } } } __syncthreads(); } } void Softmax(Tensor out, Tensor in) { cudaSetDevice(out->getDeviceId().no); size_t m = out->shape().elements() / out->shape().back(); size_t k = out->shape().back(); int blocks = std::min(MAX_BLOCKS, (int)m); int threads = std::min(MAX_THREADS, (int)k); int shared = sizeof(float) * threads; // accumulate into float if(in->type() == Type::float32) { gSoftmax<<>>(out->data(), out->shape(), in->data()); #if COMPILE_FP16 } else if (in->type() == Type::float16) { gSoftmax<<>>(out->data(), out->shape(), in->data()); #endif } else { ABORT("Softmax not implemented for type {}", in->type()); } } template __global__ void gSinusoidalPositionEmbeddings(T* out, functional::Shape outShape, int start) { using namespace functional; int rows = outShape.elements() / outShape.back(); int cols = outShape.back(); float numTimescales = (float)cols / 2.f; float logTimescaleIncrement = Ops::log(10000.f) / (numTimescales - 1.f); for(int bid = 0; bid < rows; bid += gridDim.x) { // loop over blocks of rows int j = bid + blockIdx.x; // blockIdx.x - row index (within block of rows) if(j < rows) { // compute softmax over one row, row elements distributed over threads T* outRow = out + j * cols; // pointer to row data for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { float v = (float)(j + start) * Ops::exp((float)(i % (int)numTimescales) * -logTimescaleIncrement); outRow[i] = (T)(i < (int)numTimescales ? Ops::sin(v) : Ops::cos(v)); } } } } } void SinusoidalPositionEmbeddings(Tensor out, int start) { cudaSetDevice(out->getDeviceId().no); size_t rows = out->shape().elements() / out->shape().back(); size_t cols = out->shape().back(); int blocks = std::min(MAX_BLOCKS, (int)rows); int threads = std::min(MAX_THREADS, (int)cols); if(out->type() == Type::float32) { gSinusoidalPositionEmbeddings<<>>(out->data(), out->shape(), start); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gSinusoidalPositionEmbeddings<<>>(out->data(), out->shape(), start); #endif } else { ABORT("SinusoidalPositionEmbeddings not implemented for type {}", out->type()); } } // @TODO: refactor to reuse code from softmax, add comments template __global__ void gLogSoftmax(T* out, const functional::Shape outShape, const T* in) { using namespace functional; int rows = outShape.elements() / outShape.back(); int cols = outShape.back(); for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { T* so = out + j * cols; const T* sp = in + j * cols; // CUDA complains if type or size of shared memory changes, keep size constant. 
extern __shared__ uint8_t _sharedBytes[]; T* _share = (T*)_sharedBytes; AccType* _shareAccType = (AccType*)_sharedBytes; T* _max = _share; // 16-bit is ok for max if applicable _max[threadIdx.x] = sp[threadIdx.x]; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { if(sp[id] > _max[threadIdx.x]) _max[threadIdx.x] = sp[id]; } } __syncthreads(); int len = blockDim.x; while(len != 1) { __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) { if(_max[threadIdx.x + skip] > _max[threadIdx.x]) { _max[threadIdx.x] = _max[threadIdx.x + skip]; } } len = (len + 1) >> 1; } __syncthreads(); T max = _max[0]; __syncthreads(); AccType* _sum = _shareAccType; // keep AccType for accumulation _sum[threadIdx.x] = 0.0; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { T sm = sp[id] - max; AccType ex = Ops::exp(sm); // sum with AccType so[id] = sm; _sum[threadIdx.x] += ex; // sum with AccType } } __syncthreads(); len = blockDim.x; while(len != 1) { __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) _sum[threadIdx.x] += _sum[threadIdx.x + skip]; len = (len + 1) >> 1; } __syncthreads(); AccType sum = _sum[0]; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) so[id] -= (T)Ops::log(sum); // take log at the end and convert } } __syncthreads(); } } void LogSoftmax(Tensor out, Tensor in) { cudaSetDevice(out->getDeviceId().no); size_t m = out->shape().elements() / out->shape().back(); size_t k = out->shape().back(); int blocks = std::min(MAX_BLOCKS, (int)m); int threads = std::min(MAX_THREADS, (int)k); int shared = sizeof(float) * threads; // use float32 as accumulation type if(in->type() == Type::float32) { gLogSoftmax<<>>(out->data(), out->shape(), in->data()); #if COMPILE_FP16 } else if (in->type() == Type::float16) { gLogSoftmax<<>>(out->data(), out->shape(), in->data()); #endif } else { ABORT("LogSoftmax not implemented for type {}", in->type()); } } template __global__ void gSoftmaxGrad(T* grad, const T* adj, const T* val, const int rows, const int cols) { for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { extern __shared__ uint8_t _sharedBytes[]; AccType* _sum = (AccType*)_sharedBytes; T* gradRow = grad + j * cols; const T* adjRow = adj + j * cols; const T* valRow = val + j * cols; _sum[threadIdx.x] = (AccType)0.0f; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { _sum[threadIdx.x] += (AccType)valRow[id] * (AccType)adjRow[id]; } } __syncthreads(); int len = blockDim.x; while(len != 1) { __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) _sum[threadIdx.x] += _sum[threadIdx.x + skip]; // accumulates in AccType len = (len + 1) >> 1; } __syncthreads(); for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { AccType val = (AccType)valRow[id] * ((AccType)adjRow[id] - _sum[0]); if(val) gradRow[id] += (T)val; } } } __syncthreads(); } } // @TODO: refactor with logsoftmax, add math void SoftmaxGrad(Tensor grad, Tensor adj, Tensor val) { cudaSetDevice(adj->getDeviceId().no); // grad and val are both m-by-k matrices, passed as input. // A weighted average of each row of grad (according to the weights // specified in val) is computed and subtracted from Out. 
// adj is multiplied for each element to get backward step in autodiff int m = grad->shape().elements() / grad->shape().back(); int k = grad->shape().back(); int blocks = std::min(MAX_BLOCKS, m); int threads = std::min(MAX_THREADS, k); int shared = sizeof(float) * threads; if(grad->type() == Type::float32) { gSoftmaxGrad<<>>( grad->data(), adj->data(), val->data(), m, k); #if COMPILE_FP16 } else if (grad->type() == Type::float16) { // Accumulate into float gSoftmaxGrad<<>>( grad->data(), adj->data(), val->data(), m, k); #endif } else { ABORT("SoftmaxGrad not implemented for type {}", grad->type()); } } template __global__ void gLogSoftmaxGrad(T* grad, const T* adj, const T* val, const int rows, const int cols) { for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { extern __shared__ uint8_t _sharedBytes[]; AccType* _sum = (AccType*)_sharedBytes; T* gradRow = grad + j * cols; const T* adjRow = adj + j * cols; const T* valRow = val + j * cols; _sum[threadIdx.x] = 0.0; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { _sum[threadIdx.x] += (AccType)adjRow[id]; } } __syncthreads(); int len = blockDim.x; while(len != 1) { __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) _sum[threadIdx.x] += _sum[threadIdx.x + skip]; // AccType len = (len + 1) >> 1; } __syncthreads(); for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) gradRow[id] += (T)((AccType)adjRow[id] - (functional::Ops::exp((AccType)valRow[id]) * _sum[0])); } } __syncthreads(); } } void LogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val) { cudaSetDevice(adj->getDeviceId().no); // grad and val are both m-by-k matrices, passed as input. // A weighted average of each row of grad (according to the weights // specified in val) is computed and subtracted from Out. 
// adj is multiplied for each element to get backward step in autodiff int m = grad->shape().elements() / grad->shape().back(); int k = grad->shape().back(); int blocks = std::min(MAX_BLOCKS, m); int threads = std::min(MAX_THREADS, k); int shared = sizeof(float) * threads; // Use float32 as accumulation type if(grad->type() == Type::float32) { gLogSoftmaxGrad<<>>( grad->data(), adj->data(), val->data(), m, k); #if COMPILE_FP16 } else if (grad->type() == Type::float16) { // accumulate into float gLogSoftmaxGrad<<>>( grad->data(), adj->data(), val->data(), m, k); #endif } else { ABORT("LogSoftmaxGrad not implemented for type {}", grad->type()); } } template __global__ void gCopyRows(T* out, const T* in, size_t cols, const IndexType* sourceRowIdx, size_t rows) { for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { size_t dstId = j; size_t srcId = sourceRowIdx[j]; T* rowOut = out + dstId * cols; const T* rowIn = in + srcId * cols; for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) rowOut[i] = rowIn[i]; } } } } void CopyRows(Tensor out, const Tensor in, const Tensor indices) { matchOrAbort(indices->type()); cudaSetDevice(out->getDeviceId().no); size_t cols = in->shape().back(); size_t rowsToCopy = indices->size(); int threads = std::min(MAX_THREADS, (int)cols); int blocks = std::min(MAX_BLOCKS, (int)rowsToCopy); if(out->type() == Type::float32) { gCopyRows<<>>( out->data(), in->data(), cols, indices->data(), rowsToCopy); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gCopyRows<<>>( out->data(), in->data(), cols, indices->data(), rowsToCopy); #endif } else { ABORT("CopyRows not implemented for type {}", out->type()); } } template __global__ void gPasteRows(T* out, const T* in, size_t cols, const IndexType* targetRowIdx, size_t rows) { for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; // index into 'indices' vector if(j < rows) { size_t dstId = targetRowIdx[j]; size_t srcId = j; T* rowOut = out + dstId * cols; const T* rowIn = in + srcId * cols; // aggregate the entire row for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; // column index --@TODO: column index should be called 'j' if(i < cols) { // Note: atomicAdd() not needed if number of blocks is 1. Avoid it because it is slow for fp16. if (gridDim.x == 1) rowOut[i] += rowIn[i]; else atomics::atomicAdd(rowOut + i, rowIn[i]); } } } } } void PasteRows(Tensor out, const Tensor in, const Tensor indices) { matchOrAbort(indices->type()); cudaSetDevice(out->getDeviceId().no); size_t cols = in->shape().back(); size_t rowsToCopy = indices->size(); int threads = std::min(MAX_THREADS, (int)cols); #if DETERMINISTIC // If we only use one block, then each core operates on a different column, // hence the summation becomes deterministic. // However, we only use e.g. 512 cores out of possibly 3000+, so this will be // 6 x slower in this example. 
int blocks = 1; #else int blocks = std::min(MAX_BLOCKS, (int)rowsToCopy); #endif if(out->type() == Type::float32) { gPasteRows<<>>( out->data(), in->data(), cols, indices->data(), rowsToCopy); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gPasteRows<<>>( out->data(), in->data(), cols, indices->data(), rowsToCopy); #endif } else { ABORT("CopyRows not implemented for type {}", out->type()); } } template __global__ void gCopyCols(T* out, const T* in, size_t rows, size_t colsIn, const IndexType* sourceColIdx, size_t colsOut) { for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { const T* rowIn = in + j * colsIn; T* rowOut = out + j * colsOut; for(int tid = 0; tid < colsOut; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < colsOut) rowOut[i] = rowIn[sourceColIdx[i]]; } } } } void CopyCols(Tensor out, const Tensor in, const Tensor indices) { matchOrAbort(indices->type()); cudaSetDevice(out->getDeviceId().no); size_t rows = in->shape().elements() / in->shape().back(); size_t cols = in->shape().back(); size_t colsToCopy = indices->size(); int threads = std::min(MAX_THREADS, (int)colsToCopy); int blocks = std::min(MAX_BLOCKS, (int)rows); if(out->type() == Type::float32) { gCopyCols<<>>( out->data(), in->data(), rows, cols, indices->data(), colsToCopy); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gCopyCols<<>>( out->data(), in->data(), rows, cols, indices->data(), colsToCopy); #endif } else { ABORT("CopyCols not implemented for type {}", out->type()); } } template __global__ void gPasteCols(T* out, const T* in, size_t rows, size_t colsOut, const IndexType* targetColIdx, size_t colsIn) { for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { const T* rowIn = in + j * colsIn; T* rowOut = out + j * colsOut; for(int tid = 0; tid < colsIn; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < colsIn) rowOut[targetColIdx[i]] += rowIn[i]; // @TODO: atomicAdd? 
} } } } void PasteCols(Tensor out, const Tensor in, const Tensor indices) { matchOrAbort(indices->type()); cudaSetDevice(out->getDeviceId().no); size_t rows = in->shape().elements() / in->shape().back(); size_t cols = in->shape().back(); size_t colsToCopy = indices->size(); int threads = std::min(MAX_THREADS, (int)colsToCopy); int blocks = std::min(MAX_BLOCKS, (int)rows); if(out->type() == Type::float32) { gPasteCols<<>>( out->data(), in->data(), rows, cols, indices->data(), colsToCopy); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gPasteCols<<>>( out->data(), in->data(), rows, cols, indices->data(), colsToCopy); #endif } else { ABORT("PasteCols not implemented for type {}", out->type()); } } template __global__ void gSelect(T* out, functional::Shape outShape, const T* in, const functional::Shape inShape, int axis, const IndexType* d_indices, const functional::Shape idxShape) { int length = outShape.elements(); functional::Array dims; for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < length) { outShape.dims(index, dims); int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor dims[axis] = (int)d_indices[idxIndex]; int inIndex = inShape.index(dims); out[index] = in[inIndex]; } } } template __global__ void gInsert(T* out, functional::Shape outShape, const T* in, const functional::Shape inShape, int axis, const IndexType* d_indices, const functional::Shape idxShape) { int length = inShape.elements(); functional::Array dims; for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < length) { inShape.dims(index, dims); int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor dims[axis] = (int)d_indices[idxIndex]; int outIndex = outShape.index(dims); if(add) out[outIndex] += in[index]; // this is probably wrong, atomicAdd? 
else out[outIndex] = in[index]; } } } void Select(Tensor out, const Tensor in, const Tensor indices, int axis) { matchOrAbort(indices->type()); cudaSetDevice(out->getDeviceId().no); int length = out->shape().elements(); int threads = std::min(MAX_THREADS, length); int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); int axisGPU = axis + functional::Shape::size() - out->shape().size(); if(out->type() == Type::float32) { gSelect<<>>(out->data(), out->shape(), in->data(), in->shape(), axisGPU, indices->data(), indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gSelect<<>>(out->data(), out->shape(), in->data(), in->shape(), axisGPU, indices->data(), indices->shape()); #endif } else if(out->type() == Type::uint32) { gSelect<<>>(out->data(), out->shape(), in->data(), in->shape(), axisGPU, indices->data(), indices->shape()); } else { ABORT("Select not implemented for type {}", out->type()); } } template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis) { matchOrAbort(indices->type()); cudaSetDevice(in->getDeviceId().no); int length = in->shape().elements(); int threads = std::min(MAX_THREADS, length); int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); int axisGPU = axis + functional::Shape::size() - out->shape().size(); if(out->type() == Type::float32) { gInsert<<>>(out->data(), out->shape(), in->data(), in->shape(), axisGPU, indices->data(), indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gInsert<<>>(out->data(), out->shape(), in->data(), in->shape(), axisGPU, indices->data(), indices->shape()); #endif } else { ABORT("Insert not implemented for type {}", out->type()); } } template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); template void Insert(Tensor out, const Tensor in, const Tensor indices, int axis); template __global__ void gGRUFastForward(T* out, const T* state, const T* xW, const T* sU, const T* b, const T* mask, size_t rows, size_t cols, bool final) { for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { float m = !mask || mask[j]; T* rowOut = out + j * cols; const T* rowState = state + j * cols; const T* xWrow = xW + j * cols * 3; const T* sUrow = sU + j * cols * 3; for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { float r = functional::Ops::sigmoid((float)xWrow[i] + (float)sUrow[i] + (float)b[i]); int k = i + cols; float z = functional::Ops::sigmoid((float)xWrow[k] + (float)sUrow[k] + (float)b[k]); int l = i + 2 * cols; float h; if(final) h = functional::Ops::tanh((float)xWrow[l] + ((float)sUrow[l] + (float)b[l]) * r); else h = functional::Ops::tanh((float)xWrow[l] + (float)sUrow[l] * r + (float)b[l]); float out = (1.f - z) * h + z * (float)rowState[i]; rowOut[i] = (T)(m * out + (1.f - m) * (float)rowState[i]); } } } } } void GRUFastForward(Tensor out, std::vector inputs, bool final) { cudaSetDevice(out->getDeviceId().no); int rows = out->shape().elements() / out->shape().back(); int cols = out->shape().back(); int blocks = std::min(MAX_BLOCKS, rows); int threads = std::min(MAX_THREADS, cols); if(out->type() == Type::float32) { gGRUFastForward<<>>( out->data(), // output inputs[0]->data(), // state inputs[1]->data(), // xW inputs[2]->data(), // sU inputs[3]->data(), // b inputs.size() > 4 ? 
inputs[4]->data() : 0, // mask rows, cols, final); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gGRUFastForward<<>>( out->data(), // output inputs[0]->data(), // state inputs[1]->data(), // xW inputs[2]->data(), // sU inputs[3]->data(), // b inputs.size() > 4 ? inputs[4]->data() : 0, // mask rows, cols, final); #endif } else { ABORT("GRUFastForward not implemented for type {}", out->type()); } } template __global__ void gGRUFastBackward(T* outState, T* outXW, T* outSU, T* outB, const T* state, const T* xW, const T* sU, const T* b, const T* mask, const T* adj, size_t rows, size_t cols, bool final) { for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { float m = !mask || mask[j]; T* rowOutState = outState + j * cols; T* rowOutXW = outXW + j * cols * 3; T* rowOutSU = outSU + j * cols * 3; T* rowOutB = outB ? outB + j * cols * 3 : nullptr; const T* rowState = state + j * cols; const T* rowXW = xW + j * cols * 3; const T* rowSU = sU + j * cols * 3; const T* rowAdj = adj + j * cols; for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { int k = i + cols; int l = i + 2 * cols; float r = functional::Ops::sigmoid((float)rowXW[i] + (float)rowSU[i] + (float)b[i]); float z = functional::Ops::sigmoid((float)rowXW[k] + (float)rowSU[k] + (float)b[k]); float h; if(final) h = functional::Ops::tanh((float)rowXW[l] + ((float)rowSU[l] + (float)b[l]) * r); else h = functional::Ops::tanh((float)rowXW[l] + (float)rowSU[l] * r + (float)b[l]); float adj = rowAdj[i]; float t = (1.f - z) * (1.f - h * h); // df/ds if(outState) rowOutState[i] += (T)((m * z - m + 1.f) * adj); // df/d(xW_r) ... float dfdxW_r = m * r * (1.f - r) * t * adj; if(final) dfdxW_r *= (float)rowSU[l] + (float)b[l]; else dfdxW_r *= (float)rowSU[l]; if(outXW) rowOutXW[i] += (T)dfdxW_r; if(outSU) rowOutSU[i] += (T)dfdxW_r; if(outB) rowOutB[i] += (T)dfdxW_r; // df/d(xW_z) ... float dfdxW_z = m * (1.f - z) * z * ((float)rowState[i] - h) * adj; if(outXW) rowOutXW[k] += (T)dfdxW_z; if(outSU) rowOutSU[k] += (T)dfdxW_z; if(outB) rowOutB[k] += (T)dfdxW_z; // df/d(xW_x) ... float dfdxW_x = m * t * adj; if(outXW) rowOutXW[l] += (T)dfdxW_x; if(outSU) rowOutSU[l] += (T)(dfdxW_x * r); if(outB) if(final) rowOutB[l] += (T)(dfdxW_x * r); else rowOutB[l] += (T)dfdxW_x; } } } } } void GRUFastBackward(Ptr allocator, std::vector outputs, std::vector inputs, Tensor adj, bool final) { cudaSetDevice(adj->getDeviceId().no); int rows = adj->shape().elements() / adj->shape().back(); int cols = adj->shape().back(); int blocks = std::min(MAX_BLOCKS, rows); int threads = std::min(MAX_THREADS, cols); Tensor tempGradBias, tempOnes; MemoryPiece::PtrType tempGradBiasMemory, tempOnesMemory; if(outputs[3]) { Shape memShape = {rows, outputs[3]->shape()[-1]}; tempGradBiasMemory = allocator->alloc(memShape.elements() * sizeOf(outputs[3]->type())); tempGradBias = TensorBase::New(tempGradBiasMemory, memShape, outputs[3]->type(), outputs[3]->getBackend()); tempGradBias->set(0.f); tempOnesMemory = allocator->alloc(rows * sizeOf(outputs[3]->type())); tempOnes = TensorBase::New(tempOnesMemory, Shape({1, rows}), outputs[3]->type(), outputs[3]->getBackend()); tempOnes->set(1.f); } if(adj->type() == Type::float32) { gGRUFastBackward<<>>( outputs[0] ? outputs[0]->data() : 0, // state - adj outputs[1] ? outputs[1]->data() : 0, // xW - adj outputs[2] ? outputs[2]->data() : 0, // sU - adj outputs[3] ? 
tempGradBias->data() : 0, // b - adj inputs[0]->data(), // state inputs[1]->data(), // xW inputs[2]->data(), // sU inputs[3]->data(), // b inputs.size() > 4 ? inputs[4]->data() : 0, // mask adj->data(), rows, cols, final); #if COMPILE_FP16 } else if (adj->type() == Type::float16) { gGRUFastBackward<<>>( outputs[0] ? outputs[0]->data() : 0, // state - adj outputs[1] ? outputs[1]->data() : 0, // xW - adj outputs[2] ? outputs[2]->data() : 0, // sU - adj outputs[3] ? tempGradBias->data() : 0, // b - adj inputs[0]->data(), // state inputs[1]->data(), // xW inputs[2]->data(), // sU inputs[3]->data(), // b inputs.size() > 4 ? inputs[4]->data() : 0, // mask adj->data(), rows, cols, final); #endif } else { ABORT("gGRUFastBackward not implemented for type {}", adj->type()); } // We use this go get rid of the atomicAdd and perform a reduce of the gradients afterwards. // This is much faster for fp16 which seems to have a broken atomicAdd implementation. // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. // This preserves precision with larger batches where all batch entries reduce into a single vector. // See also AffineNodeOp where we do the same for biases if(outputs[3]) { gpu::Prod(outputs[3], tempOnes, tempGradBias, false, false, 1, 1, Type::float32); // beta set to one to add allocator->free(tempGradBiasMemory); allocator->free(tempOnesMemory); } } template __global__ void gCrossEntropyPick(AccType* out, const functional::Shape outShape, const T* in, const functional::Shape inShape, const IndexType* pick, AccType labelSmoothingAlpha = AccType(0.f)) { int rows = inShape.elements() / inShape.back(); int cols = inShape.back(); extern __shared__ uint8_t _sharedBytes[]; for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { const T* sp = in + j * cols; T* _max = (T*)_sharedBytes; _max[threadIdx.x] = sp[threadIdx.x]; for(int tid = 1; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { if(sp[id] > _max[threadIdx.x]) _max[threadIdx.x] = sp[id]; } } __syncthreads(); int len = blockDim.x; while(len != 1) { __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) { if(_max[threadIdx.x + skip] > _max[threadIdx.x]) { _max[threadIdx.x] = _max[threadIdx.x + skip]; } } len = (len + 1) >> 1; } __syncthreads(); T max = _max[0]; __syncthreads(); AccType* _acc = (AccType*)_sharedBytes; _acc[2 * threadIdx.x ] = (AccType)0.0f; _acc[2 * threadIdx.x + 1] = (AccType)0.0f; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { _acc[2 * threadIdx.x ] += functional::Ops::exp(sp[id] - max); _acc[2 * threadIdx.x + 1] += (AccType)(sp[id] - max); } } __syncthreads(); len = blockDim.x; while(len != 1) { __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) { _acc[2 * threadIdx.x ] += _acc[2 * (threadIdx.x + skip) ]; _acc[2 * threadIdx.x + 1] += _acc[2 * (threadIdx.x + skip) + 1]; } len = (len + 1) >> 1; } __syncthreads(); AccType sumexp = _acc[0]; // H(u, p) = 1/N * logsoftmax(h) = mean(h - max) - log(sum(exp(h - max))) AccType mean = _acc[1] / (AccType)cols; // mean(h - max) for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id == (int)pick[j]) { AccType logsumexp = functional::Ops::log(sumexp); AccType ce = logsumexp - (AccType)sp[id] + (AccType)max; // cross-entropy H(y^, p) AccType ls = logsumexp - mean; // label smoothing H(u, p) out[j] = (1.f - labelSmoothingAlpha) * ce + labelSmoothingAlpha * ls; // (1 - alpha) * H(y^, p) + 
alpha * H(u, p) } } } __syncthreads(); } } // In each j-th row, take the corresponding j-th label index i from indices and compute: // For each vocabulary item v, the only non-zero element in a row in the sum is the item // that matches the label indexed by i (the picked element). // C = sum_{v in V}(-logsoftmax(A) * delta(v, i) = -logsoftmax(A)[i] void CrossEntropyPick(Tensor out, Tensor in, Tensor indices, float labelSmoothingAlpha) { matchOrAbort(indices->type()); cudaSetDevice(out->getDeviceId().no); int rows = in->shape().elements() / in->shape().back(); int cols = in->shape().back(); int blocks = std::min(MAX_BLOCKS, (int)rows); int threads = std::min(MAX_THREADS, (int)cols); int shared = sizeof(float) * threads * 2; // Use float32 as accumulation type if(out->type() == Type::float32 && in->type() == Type::float32) { gCrossEntropyPick<<>>( out->data(), out->shape(), in->data(), in->shape(), indices->data(), labelSmoothingAlpha); #if COMPILE_FP16 } else if(out->type() == Type::float32 && in->type() == Type::float16) { gCrossEntropyPick<<>>( out->data(), out->shape(), in->data(), in->shape(), indices->data(), labelSmoothingAlpha); #endif } else { ABORT("CrossEntropyPick not implemented for input type {} and output type{}", in->type(), out->type()); } } template __global__ void gCrossEntropyPickBackward(T* out, const functional::Shape outShape, const AccType* adj, const T* in, const IndexType* pick, AccType labelSmoothingAlpha = AccType(0.f)) { int rows = outShape.elements() / outShape.back(); int cols = outShape.back(); extern __shared__ uint8_t _sharedBytes[]; for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { const T* sp = in + j * cols; T* so = out + j * cols; T* _max = (T*)_sharedBytes; _max[threadIdx.x] = sp[threadIdx.x]; for(int tid = 1; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { if(sp[id] > _max[threadIdx.x]) _max[threadIdx.x] = sp[id]; } } __syncthreads(); int len = blockDim.x; while(len != 1) { __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) { if(_max[threadIdx.x + skip] > _max[threadIdx.x]) { _max[threadIdx.x] = _max[threadIdx.x + skip]; } } len = (len + 1) >> 1; } __syncthreads(); T max = _max[0]; __syncthreads(); AccType* _sum = (AccType*)_sharedBytes; _sum[threadIdx.x] = 0.0; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { AccType ex = functional::Ops::exp(sp[id] - max); _sum[threadIdx.x] += ex; } } __syncthreads(); len = blockDim.x; while(len != 1) { __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) _sum[threadIdx.x] += _sum[threadIdx.x + skip]; len = (len + 1) >> 1; } __syncthreads(); // cross-entropy for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { AccType sub = (AccType)(id == (int)pick[j]); AccType dce = functional::Ops::exp(sp[id] - max) / _sum[0] - sub; AccType dls = labelSmoothingAlpha * (sub - 1.f / (AccType)cols); so[id] += (T)(adj[j] * (dce + dls)); } } } __syncthreads(); } } void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor indices, float labelSmoothingAlpha) { matchOrAbort(indices->type()); cudaSetDevice(out->getDeviceId().no); int rows = out->shape().elements() / out->shape().back(); int cols = out->shape().back(); int blocks = std::min(MAX_BLOCKS, (int)rows); int threads = std::min(MAX_THREADS, (int)cols); int shared = sizeof(float) * threads; // use float as accumulation type if(out->type() == Type::float32 && adj->type() == 
Type::float32) { gCrossEntropyPickBackward<<>>( out->data(), out->shape(), adj->data(), a->data(), indices->data(), labelSmoothingAlpha); #if COMPILE_FP16 } else if(out->type() == Type::float16 && adj->type() == Type::float32) { gCrossEntropyPickBackward<<>>( out->data(), out->shape(), adj->data(), a->data(), indices->data(), labelSmoothingAlpha); #endif } else { ABORT("CrossEntropyPickBackward not implemented for type {} and adjoint type {}", out->type(), adj->type()); } } // computes the L2Norm of tensor and returns value as flaot on the CPU, // this is mostly used for diagnostic purposes and gradient clipping float L2Norm(Tensor in, Ptr allocator) { // @TODO: reverse order of arguments cudaSetDevice(in->getDeviceId().no); int size = in->shape().elements(); int threads = std::min(MAX_THREADS, size); int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0)); using namespace functional; float l2Norm; if(in->type() == Type::float32) { l2Norm = std::sqrt(AggregateAllAndReturn(allocator, /*functor=*/_1 * _1, /*aggInit=*/0.f, /*aggFunctor=*/_1 + _2, /*scale=*/1.f, in)); #if COMPILE_FP16 } else if(in->type() == Type::float16) { l2Norm = std::sqrt(AggregateAllAndReturn(allocator, /*functor=*/_1 * _1, /*aggInit=*/0.f, /*aggFunctor=*/_1 + _2, /*scale=*/1.f, in)); #endif } else { ABORT("L2Norm not implemented for type {}", in->type()); } return l2Norm; } template __global__ void gAtt(T* out, const T* va, const T* ctx, const T* state, int m, // total rows (batch x time x beam) int k, // depth int b, // batch size int t // time of ctx ) { int rows = m; int cols = k; for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { const T* vaRow = va; const T* ctxRow = ctx + (j % (b * t)) * cols; const T* stateRow = state + ((j / (b * t)) * b + j % b) * cols; extern __shared__ AccType _share[]; AccType* _sum = _share; _sum[threadIdx.x] = 0.f; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { AccType z = (AccType)ctxRow[id] + (AccType)stateRow[id]; AccType ex = functional::Ops::tanh(z) * (AccType)vaRow[id]; _sum[threadIdx.x] += ex; } } __syncthreads(); int len = blockDim.x; while(len != 1) { __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) _sum[threadIdx.x] += _sum[threadIdx.x + skip]; len = (len + 1) >> 1; } __syncthreads(); out[j] = (T)_sum[0]; } __syncthreads(); } } void Att(Tensor out, Tensor va, Tensor context, Tensor state) { cudaSetDevice(out->getDeviceId().no); size_t totalRows = out->shape().elements() / out->shape().back(); // number of rows size_t modelDim = context->shape()[-1]; // number of cols size_t batchDim = context->shape()[-2]; size_t contextWordsDim = context->shape()[-3]; int blocks = std::min(MAX_BLOCKS, (int)totalRows); int threads = std::min(MAX_THREADS, (int)modelDim); int shared = sizeof(float) * threads; if(out->type() == Type::float32) { gAtt<<>>( out->data(), va->data(), context->data(), state->data(), totalRows, modelDim, batchDim, contextWordsDim); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gAtt<<>>( out->data(), va->data(), context->data(), state->data(), totalRows, modelDim, batchDim, contextWordsDim); #endif } else { ABORT("gAtt not implemented for type {}", out->type()); } } template __global__ void gAttBack(T* gVa, T* gContext, T* gState, const T* va, const T* context, const T* state, const T* adj, int m, // rows int k, // cols int n // batch size ) { int rows = m; int cols = k; for(int bid = 0; bid < m; bid += gridDim.x) { int j = bid + 
blockIdx.x; if(j < rows) { T* gcRow = gContext + j * cols; T* gsRow = gState + (j % n) * cols; const T* cRow = context + j * cols; const T* sRow = state + (j % n) * cols; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { T z = cRow[id] + sRow[id]; T t = functional::Ops::tanh(z); T r = va[id] * ((T)1.f - t * t); gcRow[id] += r * adj[j]; // atomicAdd? reasons for instabilities? gsRow[id] += r * adj[j]; atomics::atomicAdd(gVa + id, t * adj[j]); // @TODO: get rid of atomicAdd via Matmul } } } } } void AttBack(Tensor gVa, Tensor gContext, Tensor gState, Tensor va, Tensor context, Tensor state, Tensor adj) { cudaSetDevice(adj->getDeviceId().no); size_t m = adj->shape().elements() / adj->shape()[-1]; size_t k = context->shape()[-1]; size_t n = context->shape()[-2]; int blocks = std::min(MAX_BLOCKS, (int)n); int threads = std::min(MAX_THREADS, (int)k); if(gVa->type() == Type::float32) { gAttBack<<>>(gVa->data(), gContext->data(), gState->data(), va->data(), context->data(), state->data(), adj->data(), m, k, n); #if COMPILE_FP16 } else if (gVa->type() == Type::float16) { gAttBack<<>>(gVa->data(), gContext->data(), gState->data(), va->data(), context->data(), state->data(), adj->data(), m, k, n); #endif } else { ABORT("gAttBack not implemented for type {}", gVa->type()); } } template __global__ void gLNormalization(T* out, const T* in, const T* gamma, const T* beta, int rows, int cols, AccType eps = 1e-9) { extern __shared__ uint8_t _sharedBytes[]; AccType* _shareAccType = (AccType*)_sharedBytes; AccType N = cols; for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { T* yRow = out + j * cols; const T* xRow = in + j * cols; AccType* _sum = _shareAccType; // accumulate into floats _sum[threadIdx.x] = (AccType)0.0f; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { _sum[threadIdx.x] += (AccType)xRow[id]; } } __syncthreads(); int len = blockDim.x; while(len != 1) { __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) { _sum[threadIdx.x] += _sum[threadIdx.x + skip]; } len = (len + 1) >> 1; } __syncthreads(); AccType mean = _sum[0] / N; __syncthreads(); AccType* _sqSum = _shareAccType; _sqSum[threadIdx.x] = (AccType)0.0f; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { AccType xv = (AccType)xRow[id]; AccType ex = xv - mean; _sqSum[threadIdx.x] += ex * ex; } } __syncthreads(); len = blockDim.x; while(len != 1) { __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) _sqSum[threadIdx.x] += _sqSum[threadIdx.x + skip]; len = (len + 1) >> 1; } __syncthreads(); AccType sigma = functional::Ops::sqrt(_sqSum[0] / N + eps); // all AccType __syncthreads(); for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { AccType gammav = (AccType)gamma[id]; AccType xv = (AccType)xRow[id]; AccType betav = beta ? 
(AccType)beta[id] : (AccType)0.f; AccType lv = (xv - mean) / sigma; AccType y = gammav * lv + betav; yRow[id] = (T)y; } } } __syncthreads(); } } void LayerNormalization(Tensor out, Tensor in, Tensor gamma, Tensor beta, float eps) { cudaSetDevice(out->getDeviceId().no); int rows = in->shape().elements() / in->shape().back(); int cols = in->shape().back(); int blocks = std::min(MAX_BLOCKS, (int)rows); int threads = std::min(MAX_THREADS, (int)cols); int shared = threads * sizeof(float); if(out->type() == Type::float32) { gLNormalization<<>>(out->data(), in->data(), gamma->data(), beta ? beta->data() : nullptr, rows, cols, eps); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gLNormalization<<>>(out->data(), in->data(), gamma->data(), beta ? beta->data() : nullptr, rows, cols, eps); #endif } else { ABORT("LayerNormalization not implemented for type {}", out->type()); } } template __global__ void gLayerNormalizationGrad(T* gradX, T* gradGamma, T* adj, T* y, T* x, T* gamma, T* beta, int rows, int cols, AccType eps = 1e-9) { extern __shared__ uint8_t sharedBytes[]; AccType* shared = (AccType*)sharedBytes; AccType N = cols; for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { AccType* sum_adj = shared; // sum of gradient coming in AccType* sum_adj_l = shared + blockDim.x; // sum of gradient coming in times layerNorm from value AccType* sum_x = shared + 2 * blockDim.x; // sum of input value x AccType* sum_sqr = shared + 3 * blockDim.x; // sum of (x - mean)^2 const T* xRow = x + j * cols; const T* yRow = y + j * cols; const T* adjRow = adj + j * cols; sum_x[threadIdx.x] = (AccType)0.0f; sum_adj[threadIdx.x] = (AccType)0.0f; sum_adj_l[threadIdx.x] = (AccType)0.0f; sum_sqr[threadIdx.x] = (AccType)0.0f; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { AccType xv = xRow[id]; AccType yv = yRow[id]; AccType betav = beta ? 
template <typename T, typename AccType = float>
__global__ void gLayerNormalizationGrad(T* gradX,
                                        T* gradGamma,
                                        T* adj,
                                        T* y,
                                        T* x,
                                        T* gamma,
                                        T* beta,
                                        int rows,
                                        int cols,
                                        AccType eps = 1e-9) {
  extern __shared__ uint8_t sharedBytes[];
  AccType* shared = (AccType*)sharedBytes;

  AccType N = cols;

  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int j = bid + blockIdx.x;
    if(j < rows) {
      AccType* sum_adj   = shared;                  // sum of gradient coming in
      AccType* sum_adj_l = shared + blockDim.x;     // sum of gradient coming in times layerNorm of value
      AccType* sum_x     = shared + 2 * blockDim.x; // sum of input value x
      AccType* sum_sqr   = shared + 3 * blockDim.x; // sum of (x - mean)^2

      const T* xRow   = x   + j * cols;
      const T* yRow   = y   + j * cols;
      const T* adjRow = adj + j * cols;

      sum_x[threadIdx.x]     = (AccType)0.0f;
      sum_adj[threadIdx.x]   = (AccType)0.0f;
      sum_adj_l[threadIdx.x] = (AccType)0.0f;
      sum_sqr[threadIdx.x]   = (AccType)0.0f;

      for(int tid = 0; tid < cols; tid += blockDim.x) {
        int id = tid + threadIdx.x;
        if(id < cols) {
          AccType xv     = xRow[id];
          AccType yv     = yRow[id];
          AccType betav  = beta ? (AccType)beta[id] : (AccType)0.f;
          AccType gammav = (AccType)gamma[id];
          AccType adjv   = adjRow[id];
          AccType lv     = (yv - betav) / gammav; // go back to LN(x) from scaled and shifted version for accumulation

          sum_x[threadIdx.x]     += xv;
          sum_adj_l[threadIdx.x] += adjv * lv;
          sum_adj[threadIdx.x]   += adjv;
        }
      }
      __syncthreads();
      int len = blockDim.x;
      while(len != 1) {
        __syncthreads();
        int skip = (len + 1) >> 1;
        if(threadIdx.x < (len >> 1)) {
          sum_x[threadIdx.x]     += sum_x[threadIdx.x + skip];     // Accumulates in AccType
          sum_adj[threadIdx.x]   += sum_adj[threadIdx.x + skip];   // Accumulates in AccType
          sum_adj_l[threadIdx.x] += sum_adj_l[threadIdx.x + skip]; // Accumulates in AccType
        }
        len = (len + 1) >> 1;
      }
      __syncthreads();
      AccType mean = sum_x[0] / N;
      __syncthreads();

      for(int tid = 0; tid < cols; tid += blockDim.x) {
        int id = tid + threadIdx.x;
        if(id < cols) {
          AccType xv = xRow[id];
          AccType ex = xv - mean;
          sum_sqr[threadIdx.x] += ex * ex;
        }
      }
      __syncthreads();
      len = blockDim.x;
      while(len != 1) {
        __syncthreads();
        int skip = (len + 1) >> 1;
        if(threadIdx.x < (len >> 1))
          sum_sqr[threadIdx.x] += sum_sqr[threadIdx.x + skip]; // Accumulates in AccType
        len = (len + 1) >> 1;
      }
      __syncthreads();
      AccType sigma = functional::Ops<AccType>::sqrt(sum_sqr[0] / N + eps);
      __syncthreads();

      // Jacobian of layer norm
      // J = [ \frac{1}{N\sigma} (N\delta_{ij} - l_i l_j - 1) ]_{ij}
      // J * a = dC/dx_i = ( N a_i - l_i \sum_j l_j a_j - \sum_j a_j ) / (N \sigma)
      for(int tid = 0; tid < cols; tid += blockDim.x) {
        int id = tid + threadIdx.x;
        if(id < cols) {
          AccType xv     = xRow[id];
          AccType gammav = (AccType)gamma[id];
          AccType adjv   = adjRow[id];
          AccType lv     = (xv - mean) / sigma;

          AccType gradLv = N * adjv - lv * sum_adj_l[0] - sum_adj[0];
          gradLv /= N * sigma;

          AccType gradXv = gammav * gradLv;

          // Keep the LN gradient between [-1000, 1000] for TensorOps; this is currently used to make values fit into fp16. This will also clip inf.
          // @TODO: to be fixed and removed.
          AccType sign   = functional::Ops<AccType>::sgn(gradXv);
          AccType cutoff = (AccType)1000.f; // @TODO: expose this somehow as an option? or better: make obsolete.
          gradXv = functional::Ops<AccType>::abs(gradXv) > cutoff ? sign * cutoff : gradXv; // if gradXv is NaN the value returned is NaN too, because NaN > value is false.

          // @TODO: frankly, this is embarrassing and should rather be removed or made optional? It does help for low-precision computation though. Maybe turn it into an option?
          gradXv = isnan(gradXv) ? 0.f : gradXv; // turn NaN into 0.

          T* gradXRow = gradX + j * cols;
          gradXRow[id] += (T)(gradXv);

          T* gradGammaRow = gradGamma + j * cols;
          // assignment is correct here as this gets summed up
          // in the next kernel via matrix product
          gradGammaRow[id] = (T)(adjv * lv);
        }
      }
    }
    __syncthreads();
  }
}
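// Host-side reference for the per-row dC/dx computed by gLayerNormalizationGrad
// (hypothetical helper, float only; it documents the math before the clipping and
// NaN handling above). With l_i = (x_i - mean) / sigma and incoming gradient a:
//   dC/dx_i = gamma_i * ( N * a_i - l_i * sum_j(l_j * a_j) - sum_j(a_j) ) / (N * sigma)
inline void layerNormGradRowReferenceSketch(float* gradX,     // += dC/dx
                                            const float* adj, // incoming gradient a
                                            const float* x,
                                            const float* gamma,
                                            int cols,
                                            float eps = 1e-9f) {
  float N = (float)cols;
  float mean = 0.f, sqSum = 0.f, sumAdj = 0.f, sumAdjL = 0.f;
  for(int i = 0; i < cols; ++i)
    mean += x[i];
  mean /= N;
  for(int i = 0; i < cols; ++i)
    sqSum += (x[i] - mean) * (x[i] - mean);
  float sigma = sqrtf(sqSum / N + eps);
  for(int i = 0; i < cols; ++i) {
    float l = (x[i] - mean) / sigma;
    sumAdj  += adj[i];
    sumAdjL += adj[i] * l;
  }
  for(int i = 0; i < cols; ++i) {
    float l = (x[i] - mean) / sigma;
    gradX[i] += gamma[i] * (N * adj[i] - l * sumAdjL - sumAdj) / (N * sigma);
  }
}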
void LayerNormalizationGrad(Ptr<Allocator> allocator,
                            Tensor gradX,
                            Tensor gradGamma,
                            Tensor gradBeta,
                            Tensor adj,
                            Tensor y,
                            Tensor x,
                            Tensor gamma,
                            Tensor beta,
                            float eps) {
  cudaSetDevice(adj->getDeviceId().no);
  int rows = y->shape().elements() / y->shape()[-1];
  int cols = y->shape()[-1];

  int threads = std::min(MAX_THREADS, cols);
  int blocks = std::min(MAX_BLOCKS, rows);

  auto tempGradGammaMemory = allocator->alloc(adj->memory()->size());
  Tensor tempGradGamma = TensorBase::New(tempGradGammaMemory, adj->shape(), adj->type(), adj->getBackend());
  tempGradGamma->set(0.f);

  auto tempOnesMemory = allocator->alloc(rows * sizeOf(adj->type()));
  Tensor tempOnes = TensorBase::New(tempOnesMemory, Shape({1, rows}), adj->type(), adj->getBackend());
  tempOnes->set(1.f);

  if(gradX->type() == Type::float32) {
    int shared = sizeof(float) * threads * 4;
    gLayerNormalizationGrad<float, float><<<blocks, threads, shared>>>(
        gradX->data<float>(),
        tempGradGamma->data<float>(),
        adj->data<float>(),
        y->data<float>(),
        x->data<float>(),
        gamma->data<float>(),
        (beta) ? beta->data<float>() : nullptr,
        rows,
        cols,
        eps);
#if COMPILE_FP16
  } else if (gradX->type() == Type::float16) {
    // accumulate in float
    int shared = sizeof(float) * threads * 4;
    gLayerNormalizationGrad<half, float><<<blocks, threads, shared>>>(
        gradX->data<half>(),
        tempGradGamma->data<half>(),
        adj->data<half>(),
        y->data<half>(),
        x->data<half>(),
        gamma->data<half>(),
        (beta) ? beta->data<half>() : nullptr,
        rows,
        cols,
        eps);
#endif
  } else {
    ABORT("LayerNormalizationGrad not implemented for type {}", gradX->type());
  }

  // We use this to get rid of the atomicAdd and perform a reduce of the gradients afterwards.
  // This is much faster for fp16 which seems to have a broken atomicAdd implementation.
  // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type.
  // This preserves precision with larger batches where all batch entries reduce into a single vector.
  // See also AffineNodeOp where we do the same for biases.
  gpu::Prod(gradGamma, tempOnes, tempGradGamma, false, false, 1, 1, Type::float32); // beta set to one to add
  if(gradBeta) // dC/dbeta = adj - inverse broadcasting (reduction)
    gpu::Prod(gradBeta, tempOnes, adj, false, false, 1, 1, Type::float32); // beta set to one to add

  allocator->free(tempGradGammaMemory);
  allocator->free(tempOnesMemory);
}
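// The gpu::Prod calls above reduce the per-row gamma/beta gradients by multiplying
// with a [1 x rows] vector of ones. A hypothetical host-side sketch of the
// equivalence (float only, an illustration rather than part of this file):
// ones[1 x rows] * G[rows x cols] = colSums[1 x cols].
inline void onesVectorReductionSketch(float* colSums,  // [cols], output
                                      const float* G,  // [rows x cols], row-major
                                      int rows, int cols) {
  for(int c = 0; c < cols; ++c)
    colSums[c] = 0.f;
  for(int r = 0; r < rows; ++r)       // same result as atomicAdd-ing each row
    for(int c = 0; c < cols; ++c)     // into colSums, but expressed as a matrix
      colSums[c] += G[r * cols + c];  // product with a ones vector
}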
template <typename T, typename AccType = float>
__global__ void gRMSNormalization(T* out,
                                  const T* in,
                                  const T* gamma,
                                  const T* beta,
                                  int rows,
                                  int cols,
                                  AccType eps = 1e-9) {
  extern __shared__ uint8_t _sharedBytes[];
  AccType* _shareAccType = (AccType*)_sharedBytes;

  AccType N = cols;
  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int j = bid + blockIdx.x;
    if(j < rows) {
      T* yRow = out + j * cols;
      const T* xRow = in + j * cols;

      AccType* _sqSum = _shareAccType;
      _sqSum[threadIdx.x] = (AccType)0.0f;
      for(int tid = 0; tid < cols; tid += blockDim.x) {
        int id = tid + threadIdx.x;
        if(id < cols) {
          AccType xv = (AccType)xRow[id];
          _sqSum[threadIdx.x] += xv * xv;
        }
      }
      __syncthreads();
      int len = blockDim.x;
      while(len != 1) {
        __syncthreads();
        int skip = (len + 1) >> 1;
        if(threadIdx.x < (len >> 1))
          _sqSum[threadIdx.x] += _sqSum[threadIdx.x + skip];
        len = (len + 1) >> 1;
      }
      __syncthreads();
      AccType rms = functional::Ops<AccType>::sqrt(_sqSum[0] / N + eps); // all AccType
      __syncthreads();

      for(int tid = 0; tid < cols; tid += blockDim.x) {
        int id = tid + threadIdx.x;
        if(id < cols) {
          AccType gammav = (AccType)gamma[id];
          AccType xv = (AccType)xRow[id];
          AccType betav = beta ? (AccType)beta[id] : (AccType)0.f;
          AccType rmsNorm = xv / rms;
          AccType y = gammav * rmsNorm + betav;
          yRow[id] = (T)y;
        }
      }
    }
    __syncthreads();
  }
}

void RMSNormalization(Tensor out,
                      Tensor in,
                      Tensor gamma,
                      Tensor beta,
                      float eps) {
  cudaSetDevice(out->getDeviceId().no);

  int rows = in->shape().elements() / in->shape().back();
  int cols = in->shape().back();

  int blocks = std::min(MAX_BLOCKS, (int)rows);
  int threads = std::min(MAX_THREADS, (int)cols);
  int shared = threads * sizeof(float);

  if(out->type() == Type::float32) {
    gRMSNormalization<float, float><<<blocks, threads, shared>>>(out->data<float>(),
                                                                 in->data<float>(),
                                                                 gamma->data<float>(),
                                                                 beta ? beta->data<float>() : nullptr,
                                                                 rows, cols, eps);
#if COMPILE_FP16
  } else if (out->type() == Type::float16) {
    gRMSNormalization<half, float><<<blocks, threads, shared>>>(out->data<half>(),
                                                                in->data<half>(),
                                                                gamma->data<half>(),
                                                                beta ? beta->data<half>() : nullptr,
                                                                rows, cols, eps);
#endif
  } else {
    ABORT("RMSNormalization not implemented for type {}", out->type());
  }
}
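// Host-side reference of what gRMSNormalization computes for a single row
// (hypothetical helper, float only; it documents the math rather than the kernel):
//   rms = sqrt((1/N) * sum_i x_i^2 + eps),   y_i = gamma_i * x_i / rms + beta_i
inline void rmsNormRowReferenceSketch(float* y,
                                      const float* x,
                                      const float* gamma,
                                      const float* beta, // may be nullptr
                                      int cols,
                                      float eps = 1e-9f) {
  float sqSum = 0.f;
  for(int i = 0; i < cols; ++i)
    sqSum += x[i] * x[i];
  float rms = sqrtf(sqSum / cols + eps); // eps sits inside the sqrt, as in the kernel
  for(int i = 0; i < cols; ++i)
    y[i] = gamma[i] * x[i] / rms + (beta ? beta[i] : 0.f);
}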
template <typename T, typename AccType = float>
__global__ void gRMSNormalizationGrad(T* gradX,
                                      T* gradGamma,
                                      T* adj,
                                      T* y,
                                      T* x,
                                      T* gamma,
                                      T* beta,
                                      int rows,
                                      int cols,
                                      AccType eps = 1e-9) {
  extern __shared__ uint8_t sharedBytes[];
  AccType* shared = (AccType*)sharedBytes;

  AccType N = cols;
  for(int bid = 0; bid < rows; bid += gridDim.x) {
    int j = bid + blockIdx.x;
    if(j < rows) {
      AccType* sum_adj_r = shared;              // sum of gradient coming in times RMSNorm of value
      AccType* sum_sqr   = shared + blockDim.x; // sum of x^2

      const T* xRow   = x   + j * cols;
      const T* yRow   = y   + j * cols;
      const T* adjRow = adj + j * cols;

      sum_adj_r[threadIdx.x] = (AccType)0.0f;
      sum_sqr[threadIdx.x]   = (AccType)0.0f;

      for(int tid = 0; tid < cols; tid += blockDim.x) {
        int id = tid + threadIdx.x;
        if(id < cols) {
          AccType xv     = xRow[id];
          AccType yv     = yRow[id];
          AccType betav  = beta ? (AccType)beta[id] : (AccType)0.f;
          AccType gammav = (AccType)gamma[id];
          AccType adjv   = adjRow[id];
          AccType rv     = (yv - betav) / gammav; // go back to RMSNorm(x) from scaled and shifted version for accumulation

          sum_adj_r[threadIdx.x] += adjv * rv;
          sum_sqr[threadIdx.x]   += xv * xv;
        }
      }
      __syncthreads();
      int len = blockDim.x;
      while(len != 1) {
        __syncthreads();
        int skip = (len + 1) >> 1;
        if(threadIdx.x < (len >> 1)) {
          sum_adj_r[threadIdx.x] += sum_adj_r[threadIdx.x + skip]; // Accumulates in AccType
          sum_sqr[threadIdx.x]   += sum_sqr[threadIdx.x + skip];   // Accumulates in AccType
        }
        len = (len + 1) >> 1;
      }
      __syncthreads();
      AccType rms = functional::Ops<AccType>::sqrt(sum_sqr[0] / N + eps);
      __syncthreads();

      // Jacobian of RMS norm
      // J = [ \frac{1}{N * rms} (N\delta_{ij} - RN_i RN_j) ]_{ij}
      // J * a = dC/dx_i = ( N a_i - RN_i \sum_j RN_j a_j ) / (N * rms)
      for(int tid = 0; tid < cols; tid += blockDim.x) {
        int id = tid + threadIdx.x;
        if(id < cols) {
          AccType xv      = xRow[id];
          AccType gammav  = (AccType)gamma[id];
          AccType adjv    = adjRow[id];
          AccType rmsNorm = xv / rms;

          AccType gradNorm = N * adjv - rmsNorm * sum_adj_r[0];
          gradNorm /= N * rms;

          AccType gradXv = gammav * gradNorm;

          // Keep the RMSNorm gradient between [-1000, 1000] for TensorOps; this is currently used to make values fit into fp16. This will also clip inf.
          // @TODO: to be fixed and removed.
          AccType sign   = functional::Ops<AccType>::sgn(gradXv);
          AccType cutoff = (AccType)1000.f; // @TODO: expose this somehow as an option? or better: make obsolete.
          gradXv = functional::Ops<AccType>::abs(gradXv) > cutoff ? sign * cutoff : gradXv; // if gradXv is NaN the value returned is NaN too, because NaN > value is false.

          // @TODO: frankly, this is embarrassing and should rather be removed or made optional? It does help for low-precision computation though. Maybe turn it into an option?
          gradXv = isnan(gradXv) ? 0.f : gradXv; // turn NaN into 0.

          T* gradXRow = gradX + j * cols;
          gradXRow[id] += (T)(gradXv);

          T* gradGammaRow = gradGamma + j * cols;
          // assignment is correct here as this gets summed up
          // in the next kernel via matrix product
          gradGammaRow[id] = (T)(adjv * rmsNorm);
        }
      }
    }
    __syncthreads();
  }
}

void RMSNormalizationGrad(Ptr<Allocator> allocator,
                          Tensor gradX,
                          Tensor gradGamma,
                          Tensor gradBeta,
                          Tensor adj,
                          Tensor y,
                          Tensor x,
                          Tensor gamma,
                          Tensor beta,
                          float eps) {
  cudaSetDevice(adj->getDeviceId().no);
  int rows = y->shape().elements() / y->shape()[-1];
  int cols = y->shape()[-1];

  int threads = std::min(MAX_THREADS, cols);
  int blocks = std::min(MAX_BLOCKS, rows);

  auto tempGradGammaMemory = allocator->alloc(adj->memory()->size());
  Tensor tempGradGamma = TensorBase::New(tempGradGammaMemory, adj->shape(), adj->type(), adj->getBackend());
  tempGradGamma->set(0.f);

  auto tempOnesMemory = allocator->alloc(rows * sizeOf(adj->type()));
  Tensor tempOnes = TensorBase::New(tempOnesMemory, Shape({1, rows}), adj->type(), adj->getBackend());
  tempOnes->set(1.f);

  if(gradX->type() == Type::float32) {
    int shared = sizeof(float) * threads * 2;
    gRMSNormalizationGrad<float, float><<<blocks, threads, shared>>>(
        gradX->data<float>(),
        tempGradGamma->data<float>(),
        adj->data<float>(),
        y->data<float>(),
        x->data<float>(),
        gamma->data<float>(),
        (beta) ? beta->data<float>() : nullptr,
        rows,
        cols,
        eps);
#if COMPILE_FP16
  } else if (gradX->type() == Type::float16) {
    // accumulate in float
    int shared = sizeof(float) * threads * 2;
    gRMSNormalizationGrad<half, float><<<blocks, threads, shared>>>(
        gradX->data<half>(),
        tempGradGamma->data<half>(),
        adj->data<half>(),
        y->data<half>(),
        x->data<half>(),
        gamma->data<half>(),
        (beta) ? beta->data<half>() : nullptr,
        rows,
        cols,
        eps);
#endif
  } else {
    ABORT("RMSNormalizationGrad not implemented for type {}", gradX->type());
  }

  // We use this to get rid of the atomicAdd and perform a reduce of the gradients afterwards.
  // This is much faster for fp16 which seems to have a broken atomicAdd implementation.
  // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type.
  // This preserves precision with larger batches where all batch entries reduce into a single vector.
  // See also AffineNodeOp where we do the same for biases.
  gpu::Prod(gradGamma, tempOnes, tempGradGamma, false, false, 1, 1, Type::float32); // beta set to one to add
  if(gradBeta) // dC/dbeta = adj - inverse broadcasting (reduction)
    gpu::Prod(gradBeta, tempOnes, adj, false, false, 1, 1, Type::float32); // beta set to one to add

  allocator->free(tempGradGammaMemory);
  allocator->free(tempOnesMemory);
}

template <bool add, typename T>
__global__ void gShift(T* out, const T* in, int length, int offset, float padValue) {
  for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
    int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
    if(index < length) {
      if(add) {
        if(index - offset >= 0 && index - offset < length)
          out[index] += in[index - offset];
      } else {
        if(index - offset < 0 || index - offset >= length)
          out[index] = (T)padValue;
        else
          out[index] = in[index - offset];
      }
    }
  }
}

void Shift(Tensor out, Tensor in, marian::Shape shift, float padValue, bool invert) {
  ABORT_IF(in->shape().size() != shift.size(), "bad dimensions");

  // BUGBUG: This can only shift along the first axis. Shifting, e.g., along the
  // last axis cannot be implemented this way.
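  // The loop below folds the per-axis shift into a single flat element offset:
  // for a row-major tensor, moving by shift[i] along axis i moves by
  // stride(i) * shift[i] elements in flat memory, so the per-axis offsets simply
  // add up (e.g., a hypothetical shape {4, 5} with shift {1, 0} gives offset = 5).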
int offset = 0; for(int i = 0; i < shift.size(); ++i) offset += in->shape().stride(i) * shift[i]; if(invert) offset = -offset; cudaSetDevice(out->getDeviceId().no); int length = out->shape().elements(); int threads = std::min(MAX_THREADS, length); int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); if(out->type() == Type::float32) { gShift <<>>(out->data(), in->data(), length, offset, padValue); #if COMPILE_FP16 } else if(out->type() == Type::float16) { gShift <<>>(out->data(), in->data(), length, offset, padValue); #endif } else { ABORT("Shift not implemented for type {}", out->type()); } } void ShiftGrad(Tensor out, Tensor in, marian::Shape shift, bool invert) { ABORT_IF(in->shape().size() != shift.size(), "bad dimensions"); // BUGBUG: This can only shift along the first axis. Shifting, e.g., along the // last axis cannot be implemented this way. int offset = 0; for(int i = 0; i < shift.size(); ++i) offset += in->shape().stride(i) * shift[i]; if(invert) offset = -offset; cudaSetDevice(out->getDeviceId().no); int length = out->shape().elements(); int threads = std::min(MAX_THREADS, length); int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); if(out->type() == Type::float32) { gShift <<>>(out->data(), in->data(), length, offset, 0.f); // @TODO: What about padValue? #if COMPILE_FP16 } else if(out->type() == Type::float16) { gShift <<>>(out->data(), in->data(), length, offset, 0.f); #endif } else { ABORT("Shift not implemented for type {}", out->type()); } } __global__ void gSetSparse(float* out, const size_t* indices, const float* values, int length) { for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < length) { out[indices[index]] = values[index]; } } } void SetSparse(float* out, const std::vector& indices, const std::vector& values) { int length = indices.size(); int threads = std::min(MAX_THREADS, length); int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); size_t* d_indices; CUDA_CHECK(cudaMalloc(&d_indices, length * sizeof(size_t))); CUDA_CHECK(cudaMemcpy(d_indices, indices.data(), length * sizeof(size_t), cudaMemcpyHostToDevice)); float* d_values; CUDA_CHECK(cudaMalloc(&d_values, length * sizeof(float))); CUDA_CHECK(cudaMemcpy( d_values, values.data(), length * sizeof(float), cudaMemcpyHostToDevice)); gSetSparse<<>>(out, d_indices, d_values, length); cudaFree(d_indices); cudaFree(d_values); } /******************************************************************************/ template __global__ void gLSTMCellForward(T* out, const T* cell, const T* xW, const T* sU, const T* b, const T* mask, size_t rows, size_t cols) { for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { T m = !mask || mask[j]; T* rowOut = out + j * cols; const T* rowCell = cell + j * cols; const T* xWrow = xW + j * cols * 4; const T* sUrow = sU + j * cols * 4; for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { T gf = functional::Ops::sigmoid(xWrow[i] + sUrow[i] + b[i]); int k = i + cols; T gi = functional::Ops::sigmoid(xWrow[k] + sUrow[k] + b[k]); int l = i + 2 * cols; T gc = functional::Ops::tanh(xWrow[l] + sUrow[l] + b[l]); T cout = gf * rowCell[i] + gi * gc; rowOut[i] = m * cout + ((T)1.f - m) * rowCell[i]; } } } } } void LSTMCellForward(Tensor out, std::vector inputs) { cudaSetDevice(out->getDeviceId().no); int rows = out->shape().elements() / out->shape().back(); int cols = 
out->shape().back(); int blocks = std::min(MAX_BLOCKS, rows); int threads = std::min(MAX_THREADS, cols); if(out->type() == Type::float32) { gLSTMCellForward<<>>( out->data(), // output inputs[0]->data(), // cell state inputs[1]->data(), // xW inputs[2]->data(), // sU inputs[3]->data(), // b inputs.size() > 4 ? inputs[4]->data() : 0, // mask rows, cols); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gLSTMCellForward<<>>( out->data(), // output inputs[0]->data(), // cell state inputs[1]->data(), // xW inputs[2]->data(), // sU inputs[3]->data(), // b inputs.size() > 4 ? inputs[4]->data() : 0, // mask rows, cols); #endif } else { ABORT("LSTMCellForward not implemented for type {}", out->type()); } } template __global__ void gLSTMOutputForward(T* out, const T* cell, const T* xW, const T* sU, const T* b, size_t rows, size_t cols) { for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { T* rowOut = out + j * cols; const T* rowCell = cell + j * cols; const T* xWrow = xW + j * cols * 4; const T* sUrow = sU + j * cols * 4; for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { int k = i + 3 * cols; T go = functional::Ops::sigmoid(xWrow[k] + sUrow[k] + b[k]); rowOut[i] = go * functional::Ops::tanh(rowCell[i]); } } } } } void LSTMOutputForward(Tensor out, std::vector inputs) { cudaSetDevice(out->getDeviceId().no); int rows = out->shape().elements() / out->shape().back(); int cols = out->shape().back(); int blocks = std::min(MAX_BLOCKS, rows); int threads = std::min(MAX_THREADS, cols); if(out->type() == Type::float32) { gLSTMOutputForward<<>>(out->data(), // output inputs[0]->data(), // cell state inputs[1]->data(), // xW inputs[2]->data(), // sU inputs[3]->data(), // b rows, cols); #if COMPILE_FP16 } else if (out->type() == Type::float16) { gLSTMOutputForward<<>>(out->data(), // output inputs[0]->data(), // cell state inputs[1]->data(), // xW inputs[2]->data(), // sU inputs[3]->data(), // b rows, cols); #endif } else { ABORT("gLSTMOutputForward not implemented for type {}", out->type()); } } template __global__ void gLSTMCellBackward(T* outCell, T* outXW, T* outSU, T* outB, const T* cell, const T* xW, const T* sU, const T* b, const T* mask, const T* adj, size_t rows, size_t cols) { for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { T m = !mask || mask[j]; T* rowOutCell = outCell + j * cols; T* rowOutXW = outXW + j * cols * 4; T* rowOutSU = outSU + j * cols * 4; const T* rowCell = cell + j * cols; const T* xWrow = xW + j * cols * 4; const T* sUrow = sU + j * cols * 4; const T* rowAdj = adj + j * cols; for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { T gf = functional::Ops::sigmoid(xWrow[i] + sUrow[i] + b[i]); int k = i + cols; T gi = functional::Ops::sigmoid(xWrow[k] + sUrow[k] + b[k]); int l = i + 2 * cols; T gc = functional::Ops::tanh(xWrow[l] + sUrow[l] + b[l]); T adj = rowAdj[i]; // dc/dc_{t-1} if(outCell) rowOutCell[i] += (m * gf - m + (T)1.f) * adj; // dc/d(b_f) = dc/d(xW_f) ... T dcdxf = m * rowCell[i] * gf * ((T)1.f - gf) * adj; if(outXW) rowOutXW[i] += dcdxf; if(outSU) rowOutSU[i] += dcdxf; if(outB) atomics::atomicAdd(outB + i, dcdxf); // @TODO: get rid of atomicAdd everywhere! // dc/d(b_i) ... T dcdb_i = m * gc * gi * ((T)1.f - gi) * adj; if(outXW) rowOutXW[k] += dcdb_i; if(outSU) rowOutSU[k] += dcdb_i; if(outB) atomics::atomicAdd(outB + k, dcdb_i); // dc/d(b_c) ... 
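// (Derivation note: with c_t = gf * c_{t-1} + gi * gc and gc = tanh(z_c),
//  dc_t/dz_c = gi * (1 - gc^2), which is the factor computed next; the mask m
//  zeroes the contribution of padded positions.)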
T dcdxc = m * gi * ((T)1.f - gc * gc) * adj; if(outXW) rowOutXW[l] += dcdxc; if(outSU) rowOutSU[l] += dcdxc; if(outB) atomics::atomicAdd(outB + l, dcdxc); } } } } } void LSTMCellBackward(std::vector outputs, std::vector inputs, Tensor adj) { cudaSetDevice(adj->getDeviceId().no); int rows = adj->shape().elements() / adj->shape().back(); int cols = adj->shape().back(); int blocks = std::min(MAX_BLOCKS, rows); int threads = std::min(MAX_THREADS, cols); if(adj->type() == Type::float32) { gLSTMCellBackward<<>>( outputs[0] ? outputs[0]->data() : 0, // state - adj outputs[1] ? outputs[1]->data() : 0, // xW - adj outputs[2] ? outputs[2]->data() : 0, // sU - adj outputs[3] ? outputs[3]->data() : 0, // b - adj inputs[0]->data(), // state inputs[1]->data(), // xW inputs[2]->data(), // sU inputs[3]->data(), // b inputs.size() > 4 ? inputs[4]->data() : 0, // mask adj->data(), rows, cols); #if COMPILE_FP16 } else if (adj->type() == Type::float16) { gLSTMCellBackward<<>>( outputs[0] ? outputs[0]->data() : 0, // state - adj outputs[1] ? outputs[1]->data() : 0, // xW - adj outputs[2] ? outputs[2]->data() : 0, // sU - adj outputs[3] ? outputs[3]->data() : 0, // b - adj inputs[0]->data(), // state inputs[1]->data(), // xW inputs[2]->data(), // sU inputs[3]->data(), // b inputs.size() > 4 ? inputs[4]->data() : 0, // mask adj->data(), rows, cols); #endif } else { ABORT("gLSTMCellBackward not implemented for type {}", adj->type()); } } template __global__ void gLSTMOutputBackward(T* outCell, T* outXW, T* outSU, T* outB, const T* cell, const T* xW, const T* sU, const T* b, const T* adj, size_t rows, size_t cols) { for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { T* rowOutCell = outCell + j * cols; T* rowOutXW = outXW + j * cols * 4; T* rowOutSU = outSU + j * cols * 4; const T* rowCell = cell + j * cols; const T* xWrow = xW + j * cols * 4; const T* sUrow = sU + j * cols * 4; const T* rowAdj = adj + j * cols; for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { int k = i + 3 * cols; T go = functional::Ops::sigmoid(xWrow[k] + sUrow[k] + b[k]); T t = functional::Ops::tanh(rowCell[i]); T adj = rowAdj[i]; // dc/dc_{t-1} if(outCell) rowOutCell[i] += go * ((T)1.f - t * t) * adj; // dc/d(b_o) = dc/d(xW_f) ... float dcdxo = t * go * ((T)1.f - go) * adj; if(outXW) rowOutXW[k] += dcdxo; if(outSU) rowOutSU[k] += dcdxo; if(outB) atomics::atomicAdd(outB + k, dcdxo); // @TODO: get rid of atomicAdd } } } } } void LSTMOutputBackward(std::vector outputs, std::vector inputs, Tensor adj) { cudaSetDevice(adj->getDeviceId().no); int rows = adj->shape().elements() / adj->shape().back(); int cols = adj->shape().back(); int blocks = std::min(MAX_BLOCKS, rows); int threads = std::min(MAX_THREADS, cols); if(adj->type() == Type::float32) { gLSTMOutputBackward<<>>( outputs[0] ? outputs[0]->data() : 0, // state - adj outputs[1] ? outputs[1]->data() : 0, // xW - adj outputs[2] ? outputs[2]->data() : 0, // sU - adj outputs[3] ? outputs[3]->data() : 0, // b - adj inputs[0]->data(), // state inputs[1]->data(), // xW inputs[2]->data(), // sU inputs[3]->data(), // b adj->data(), rows, cols); #if COMPILE_FP16 } else if (adj->type() == Type::float16) { gLSTMOutputBackward<<>>( outputs[0] ? outputs[0]->data() : 0, // state - adj outputs[1] ? outputs[1]->data() : 0, // xW - adj outputs[2] ? outputs[2]->data() : 0, // sU - adj outputs[3] ? 
outputs[3]->data() : 0, // b - adj inputs[0]->data(), // state inputs[1]->data(), // xW inputs[2]->data(), // sU inputs[3]->data(), // b adj->data(), rows, cols); #endif } else { ABORT("gLSTMOutputBackward not implemented for type {}", adj->type()); } } template __global__ void gHighwayForward(T* out, const T* in1, const T* in2, const T* t, size_t length) { for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < length) { T sigma = functional::Ops::sigmoid(t[index]); out[index] = in1[index] * sigma + in2[index] * ((T)1.f - sigma); } } } void HighwayForward(Tensor out, const Tensor in1, const Tensor in2, const Tensor t) { cudaSetDevice(out->getDeviceId().no); int length = out->shape().elements(); int threads = std::min(MAX_THREADS, length); int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); if(out->type() == Type::float32) { gHighwayForward<<>>( out->data(), in1->data(), in2->data(), t->data(), length); #if COMPILE_FP16 } else if(out->type() == Type::float16) { gHighwayForward<<>>( out->data(), in1->data(), in2->data(), t->data(), length); #endif } else { ABORT("HighwayForward not implemented for type {}", out->type()); } } template __global__ void gHighwayBackward(T* out1, T* out2, T* outt, const T* in1, const T* in2, const T* t, const T* adj, size_t length) { for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < length) { T sigma = functional::Ops::sigmoid(t[index]); out1[index] = sigma * adj[index]; out2[index] = ((T)1.f - sigma) * adj[index]; outt[index] = sigma * ((T)1.f - sigma) * (in1[index] - in2[index]) * adj[index]; } } } void HighwayBackward(Tensor out1, Tensor out2, Tensor outt, const Tensor in1, const Tensor in2, const Tensor t, const Tensor adj) { cudaSetDevice(out1->getDeviceId().no); int length = out1->shape().elements(); int threads = std::min(MAX_THREADS, length); int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); if(out1->type() == Type::float32) { gHighwayBackward<<>>(out1->data(), out2->data(), outt->data(), in1->data(), in2->data(), t->data(), adj->data(), length); #if COMPILE_FP16 } else if(out1->type() == Type::float16) { gHighwayBackward<<>>(out1->data(), out2->data(), outt->data(), in1->data(), in2->data(), t->data(), adj->data(), length); #endif } else { ABORT("HighwayForward not implemented for type {}", out1->type()); } } __global__ void gMaxPoolingForward(float* out, int outRows, int outCols, float* in, int inRows, int inCols, float* mask, int numKernels, int maskCols, int width, int lastWidth) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if(tid >= outRows * outCols) return; int rowId = tid / outRows; int colId = tid % outRows; float* b = in + (rowId * inCols) + (colId * width); float* localMask = mask + (rowId / numKernels) * maskCols + colId * width; if(colId == outRows - 1) { width = lastWidth; } float currentMax = b[0] * localMask[0]; for(int i = 1; i < width; ++i) { if(b[i] * localMask[i] > currentMax) { currentMax = b[i] * localMask[i]; } } out[rowId + (colId * outCols)] = currentMax; } void PoolingWithMaskingForward(Tensor out, Tensor in, Tensor mask, int width, bool isEven) { matchOrAbort(out->type()); int n = out->shape().elements(); int threads = std::min(n, MAX_THREADS); int blocks = n / threads + (n % threads != 0); auto& inShape = in->shape(); int inRows = inShape[0] * inShape[1]; int inCols = inShape[2]; auto& outShape = out->shape(); 
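// (Note: the pooling output is indexed as [outShape[0] * outShape[1], outShape[2]];
//  lastWidth, computed next, handles the final window when (inCols - isEven) is not
//  a multiple of the pooling width, so the last column pools over a shorter span.)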
int outRows = outShape[2]; int outCols = outShape[0] * outShape[1]; int lastWidth = ((inCols - isEven) % width == 0) ? width : (inCols - isEven) % width; gMaxPoolingForward<<>>(out->data(), outRows, outCols, in->data(), inRows, inCols, mask->data(), outShape[1], mask->shape()[2], width, lastWidth); } __global__ void gMaxPoolingBackward(float* adj, int adjRows, int adjCols, float* in, float* adjIn, int inRows, int inCols, float* mask, int numKernels, int maskCols, int width, int lastWidth) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if(tid >= adjRows * adjCols) return; int rowId = tid / adjRows; int colId = tid % adjRows; float* b = in + (rowId * inCols) + (colId * width); if(colId == adjRows - 1) { width = lastWidth; } float* localMask = mask + (rowId / numKernels) * maskCols + colId * width; size_t currentMaxIdx = 0; for(int i = 1; i < width; ++i) { if(b[i] * localMask[i] > b[currentMaxIdx] * localMask[currentMaxIdx]) { currentMaxIdx = i; } } adjIn[(rowId * inCols) + (colId * width) + currentMaxIdx] += adj[rowId + (colId * adjCols)]; } void PoolingWithMaskingBackward(Tensor adj, Tensor adjIn, Tensor in, Tensor mask, int width, bool isEven) { matchOrAbort(adj->type()); int n = adj->shape().elements(); int threads = std::min(n, 512); int blocks = n / threads + (n % threads != 0); auto& inShape = in->shape(); int inRows = inShape[0] * inShape[1]; int inCols = inShape[2]; auto& adjShape = adj->shape(); int adjRows = adjShape[2]; int adjCols = adjShape[0] * adjShape[1]; int lastWidth = ((inCols - isEven) % width == 0) ? width : (inCols - isEven) % width; gMaxPoolingBackward<<>>(adj->data(), adjRows, adjCols, in->data(), adjIn->data(), inRows, inCols, mask->data(), adjShape[1], mask->shape()[2], width, lastWidth); } } // namespace gpu } // namespace marian
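// Layout note for the kernels above: the LSTM inputs xW and sU are [rows x 4*cols],
// with the forget, input, cell and output pre-activations at column offsets
// 0, cols, 2*cols and 3*cols respectively; the highway kernels combine two inputs
// as sigmoid(t) * in1 + (1 - sigmoid(t)) * in2; and the masked pooling kernels take
// the maximum over each width-sized window of the mask-multiplied input.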