.. _program_listing_file_src_tensors_tensor_operators.h:

Program Listing for File tensor_operators.h
===========================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_tensors_tensor_operators.h>` (``src/tensors/tensor_operators.h``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   #pragma once

   #include "common/definitions.h"
   #include "tensors/allocator.h"
   #include "tensors/tensor.h"
   #include "tensors/dispatch.h"

   #include "functional/shape.h"
   #include "functional/tensor.h"
   #include "functional/tmp.h"

   #ifdef CUDA_FOUND
   #include "tensors/gpu/add.h"
   #include "tensors/gpu/algorithm.h"
   #include "tensors/gpu/element.h"
   #include "tensors/gpu/prod.h"
   #endif

   #include "tensors/cpu/add.h"
   #include "tensors/cpu/element.h"

   #include <algorithm>

   namespace marian {

   template <typename InIt, typename OutIt>
   void copy(Ptr<Backend>& backend, const InIt beg, const InIt end, OutIt it) {
   #ifdef CUDA_FOUND
     if(backend->getDeviceId().type == DeviceType::gpu)
       gpu::copy(backend, beg, end, it);
     else
       std::copy(beg, end, it);
   #else
     backend;
     std::copy(beg, end, it);
   #endif
   }

   DISPATCH2(CopyCast, marian::Tensor, const marian::Tensor);
   DISPATCH2(AddCast, marian::Tensor, const marian::Tensor);
   DISPATCH4(IsNaN, const Tensor, Ptr<Allocator>, bool&, bool&);

   #ifdef CUDA_FOUND
   namespace gpu {
   bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf);
   }
   #endif

   namespace cpu {
   bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf);
   }

   static inline bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf) {
   #ifdef CUDA_FOUND
     if(in->getBackend()->getDeviceId().type == DeviceType::gpu)
       return gpu::SanitizeGradient(in, allocator, pruneNaN, clipInf);
     else
   #endif
       return cpu::SanitizeGradient(in, allocator, pruneNaN, clipInf);
   }

   template <class Functor, class... Tensors>
   void Element(Functor functor, marian::Tensor out, Tensors... tensors) {
   #ifdef CUDA_FOUND
     if(out->getBackend()->getDeviceId().type == DeviceType::gpu)
       gpu::Element(functor, out, tensors...);
     else
   #endif
       cpu::Element(functor, out, tensors...);
   }

   template <class Functor, class... Tensors>
   void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
   #ifdef CUDA_FOUND
     if(out->getBackend()->getDeviceId().type == DeviceType::gpu)
       gpu::Add(functor, scale, out, tensors...);
     else
   #endif
       cpu::Aggregate(functor, /*aggInit=*/0.0f, functional::_1 + functional::_2, scale, out, tensors...);
   }

   template <class Functor, class... Tensors>
   void Add(Functor functor, marian::Tensor out, Tensors... tensors) {
     Add(functor, /*scale=*/1.f, out, tensors...);
   }

   template <class Functor, class AggFunctor, class... Tensors>
   void Aggregate(Functor functor, float aggInit, AggFunctor aggFunctor, marian::Tensor out, Tensors... tensors) {
   #ifdef CUDA_FOUND
     if(out->getBackend()->getDeviceId().type == DeviceType::gpu)
       gpu::Aggregate(functor, aggInit, aggFunctor, 1.0f, out, tensors...);
     else
   #endif
       cpu::Aggregate(functor, aggInit, aggFunctor, 1.0f, out, tensors...);
   }

   template <class Functor, class... Tensors>
   void Reduce(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
     out->set(0.f);
     Add(functor, scale, out, tensors...);
   }

   template <class Functor, class... Tensors>
   void Reduce(Functor functor, marian::Tensor out, Tensors... tensors) {
     out->set(0.f);
     Add(functor, out, tensors...);
   }

   template <class Functor, class AggFunctor, class... Tensors>
   void Reduce(Functor functor, AggFunctor aggFunctor, float aggInit, marian::Tensor out, Tensors... tensors) {
     out->set(aggInit);
     Aggregate(functor, aggInit, aggFunctor, out, tensors...);
   }
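
   // Usage sketch (illustrative only; out, a and b are hypothetical tensors of
   // compatible shapes). The wrappers above are typically driven by expressions
   // built from functional placeholders, as Bernoulli() does further below:
   //
   //   using namespace functional;
   //   Element(_1 = _2 + _3, out, a, b);  // element-wise: out[i] = a[i] + b[i]
   //   Reduce(_1 * _2, out, a, b);        // accumulate a * b, reduced into out's shape
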
   // clang-format off
   DISPATCH7(Prod, marian::Tensor, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float)
   DISPATCH8(Prod, marian::Tensor, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float, Type) // overloading since we want the default for computeType to be C->type(), which is difficult otherwise.
   DISPATCH8(ProdBatched, marian::Tensor, Ptr<Allocator>, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
   DISPATCH8(ProdBatchedLegacy, marian::Tensor, Ptr<Allocator>, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
   DISPATCH9(CSRProd, marian::Tensor, Ptr<Allocator>, const marian::Tensor&, const marian::Tensor&, const marian::Tensor&, const marian::Tensor&, bool, bool, float)

   DISPATCH10(Affine, marian::Tensor, Ptr<Allocator>, const marian::Tensor&, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float, bool)

   DISPATCH2(Softmax, marian::Tensor, marian::Tensor)
   DISPATCH3(SoftmaxGrad, marian::Tensor, marian::Tensor, marian::Tensor)

   DISPATCH2(LogSoftmax, marian::Tensor, marian::Tensor)
   DISPATCH3(LogSoftmaxGrad, marian::Tensor, marian::Tensor, marian::Tensor)

   DISPATCH4(CrossEntropyPick, marian::Tensor, marian::Tensor, marian::Tensor, float)
   DISPATCH5(CrossEntropyPickBackward, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float)

   DISPATCH3(TransposeND, marian::Tensor, marian::Tensor, const std::vector<int>&)
   DISPATCH3(TransposeNDGrad, marian::Tensor, marian::Tensor, const std::vector<int>&)

   DISPATCH5(Shift, marian::Tensor, marian::Tensor, marian::Shape, float, bool)
   DISPATCH4(ShiftGrad, marian::Tensor, marian::Tensor, marian::Shape, bool)

   DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int)
   // clang-format on
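
   // Note (assumption, based on the hand-written dispatchers elsewhere in this
   // file): each DISPATCHn(Name, ...) is expected to declare gpu::Name and
   // cpu::Name prototypes plus a static inline marian::Name wrapper that
   // forwards to the GPU or CPU implementation depending on the first tensor
   // argument's DeviceType, just as Deconcatenate() and L2Norm() are written
   // out explicitly below.
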
   // Bernoulli(tensor, 0.5f, 2.f, -1.f) generates a tensor composed of 50% of 1 and 50% of -1.
   static inline void Bernoulli(Tensor resultTensor, float keepProb, float scale = 1.f, float shift = 0.f) {
     // in-place uniform distribution
     auto rnd = resultTensor->getBackend()->getRandomGenerator();
     rnd->uniform(resultTensor, 0.f, 1.f); // temporarily mis-use this to hold the random numbers
     using namespace functional;
     Element(_1 = (_1 < keepProb) * scale + shift, resultTensor);
   }

   static inline void Dropout(Tensor tensor, float dropProb) {
     float keepProb = 1.f - dropProb;
     float scale = 1.f / keepProb;
     Bernoulli(tensor, keepProb, scale, /*shift=*/0.f);
   }

   DISPATCH2(SinusoidalPositionEmbeddings, marian::Tensor, int);

   #ifdef CUDA_FOUND
   namespace gpu {
   void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
   }
   #endif

   namespace cpu {
   void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
   }

   static inline void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax) {
   #ifdef CUDA_FOUND
     if(in->getBackend()->getDeviceId().type == DeviceType::gpu)
       gpu::Deconcatenate(outputs, in, ax);
     else
   #endif
       cpu::Deconcatenate(outputs, in, ax);
   }

   // clang-format off
   DISPATCH5(LayerNormalization, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float)

   #ifdef CUDA_FOUND
   namespace gpu {
   void LayerNormalizationGrad(Ptr<Allocator> allocator, Tensor gradX, Tensor gradGamma, Tensor gradBeta, Tensor adj, Tensor y, Tensor x, Tensor gamma, Tensor beta, float eps);
   }
   #endif

   namespace cpu {
   void LayerNormalizationGrad(Tensor gradX, Tensor gradGamma, Tensor gradBeta, Tensor adj, Tensor y, Tensor x, Tensor gamma, Tensor beta, float eps);
   }

   static inline void LayerNormalizationGrad(Ptr<Allocator> allocator,
                                             Tensor gradX, Tensor gradGamma, Tensor gradBeta,
                                             Tensor adj, Tensor y, Tensor x, Tensor gamma, Tensor beta,
                                             float eps) {
   #ifdef CUDA_FOUND
     if(gradX->getBackend()->getDeviceId().type == DeviceType::gpu)
       gpu::LayerNormalizationGrad(allocator, gradX, gradGamma, gradBeta, adj, y, x, gamma, beta, eps);
     else
   #endif
       cpu::LayerNormalizationGrad(gradX, gradGamma, gradBeta, adj, y, x, gamma, beta, eps);
   }

   // clang-format off
   DISPATCH5(RMSNormalization, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float)

   #ifdef CUDA_FOUND
   namespace gpu {
   void RMSNormalizationGrad(Ptr<Allocator> allocator, Tensor gradX, Tensor gradGamma, Tensor gradBeta, Tensor adj, Tensor y, Tensor x, Tensor gamma, Tensor beta, float eps);
   }
   #endif

   namespace cpu {
   void RMSNormalizationGrad(Tensor gradX, Tensor gradGamma, Tensor gradBeta, Tensor adj, Tensor y, Tensor x, Tensor gamma, Tensor beta, float eps);
   }

   static inline void RMSNormalizationGrad(Ptr<Allocator> allocator,
                                           Tensor gradX, Tensor gradGamma, Tensor gradBeta,
                                           Tensor adj, Tensor y, Tensor x, Tensor gamma, Tensor beta,
                                           float eps) {
   #ifdef CUDA_FOUND
     if(gradX->getBackend()->getDeviceId().type == DeviceType::gpu)
       gpu::RMSNormalizationGrad(allocator, gradX, gradGamma, gradBeta, adj, y, x, gamma, beta, eps);
     else
   #endif
       cpu::RMSNormalizationGrad(gradX, gradGamma, gradBeta, adj, y, x, gamma, beta, eps);
   }
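
   // For reference (standard definitions; this header only declares the kernels):
   // LayerNormalization(out, x, gamma, beta, eps) is assumed to compute
   //   out = gamma * (x - mean(x)) / sqrt(var(x) + eps) + beta
   // over the last axis, and RMSNormalization the analogous
   //   out = gamma * x / sqrt(mean(x^2) + eps) (+ beta).
   // The *Grad dispatchers above compute the corresponding gradients; only the
   // GPU path takes an extra Allocator.
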
   DISPATCH4(HighwayForward, marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor)
   DISPATCH7(HighwayBackward, marian::Tensor, marian::Tensor, marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor)

   DISPATCH3(CopyRows, marian::Tensor, const marian::Tensor, const marian::Tensor)
   DISPATCH3(PasteRows, marian::Tensor, const marian::Tensor, const marian::Tensor)
   DISPATCH3(CopyCols, marian::Tensor, const marian::Tensor, const marian::Tensor)
   DISPATCH3(PasteCols, marian::Tensor, const marian::Tensor, const marian::Tensor)

   DISPATCH4(Select, marian::Tensor, const marian::Tensor, const marian::Tensor, int)

   #ifdef CUDA_FOUND
   namespace gpu {
   template <bool add>
   void Insert(Tensor out, const Tensor in, const Tensor indices, int axis);
   }
   #endif

   namespace cpu {
   template <bool add>
   void Insert(Tensor out, const Tensor in, const Tensor indices, int axis);
   }

   template <bool add>
   static inline void Insert(Tensor out, const Tensor in, const Tensor indices, int axis) {
   #ifdef CUDA_FOUND
     if(out->getBackend()->getDeviceId().type == DeviceType::gpu)
       gpu::Insert<add>(out, in, indices, axis);
     else
   #endif
       cpu::Insert<add>(out, in, indices, axis);
   }

   DISPATCH7(TopK, marian::Tensor, marian::Tensor, Ptr<Allocator>, const marian::Tensor, int, int, bool);

   DISPATCH2(LSTMCellForward, marian::Tensor, std::vector<marian::Tensor>)
   DISPATCH2(LSTMOutputForward, marian::Tensor, std::vector<marian::Tensor>);
   // clang-format on

   #ifdef CUDA_FOUND
   namespace gpu {
   void LSTMCellBackward(std::vector<marian::Tensor> outputs, std::vector<marian::Tensor> inputs, marian::Tensor adj);
   }
   #endif

   namespace cpu {
   void LSTMCellBackward(std::vector<marian::Tensor> outputs, std::vector<marian::Tensor> inputs, marian::Tensor adj);
   }

   static inline void LSTMCellBackward(std::vector<marian::Tensor> outputs, std::vector<marian::Tensor> inputs, marian::Tensor adj) {
   #ifdef CUDA_FOUND
     if(adj->getBackend()->getDeviceId().type == DeviceType::gpu)
       gpu::LSTMCellBackward(outputs, inputs, adj);
     else
   #endif
       cpu::LSTMCellBackward(outputs, inputs, adj);
   }

   #ifdef CUDA_FOUND
   namespace gpu {
   void LSTMOutputBackward(std::vector<marian::Tensor> outputs, std::vector<marian::Tensor> inputs, marian::Tensor adj);
   }
   #endif

   namespace cpu {
   void LSTMOutputBackward(std::vector<marian::Tensor> outputs, std::vector<marian::Tensor> inputs, marian::Tensor adj);
   }

   static inline void LSTMOutputBackward(std::vector<marian::Tensor> outputs, std::vector<marian::Tensor> inputs, marian::Tensor adj) {
   #ifdef CUDA_FOUND
     if(adj->getBackend()->getDeviceId().type == DeviceType::gpu)
       gpu::LSTMOutputBackward(outputs, inputs, adj);
     else
   #endif
       cpu::LSTMOutputBackward(outputs, inputs, adj);
   }

   DISPATCH3(GRUFastForward, marian::Tensor, std::vector<marian::Tensor>, bool)

   #ifdef CUDA_FOUND
   namespace gpu {
   void GRUFastBackward(Ptr<Allocator> allocator, std::vector<marian::Tensor> outputs, std::vector<marian::Tensor> inputs, marian::Tensor adj, bool final);
   }
   #endif

   namespace cpu {
   void GRUFastBackward(Ptr<Allocator> allocator, std::vector<marian::Tensor> outputs, std::vector<marian::Tensor> inputs, marian::Tensor adj, bool final);
   }

   static inline void GRUFastBackward(Ptr<Allocator> allocator, std::vector<marian::Tensor> outputs, std::vector<marian::Tensor> inputs, marian::Tensor adj, bool final = false) {
   #ifdef CUDA_FOUND
     if(adj->getBackend()->getDeviceId().type == DeviceType::gpu)
       gpu::GRUFastBackward(allocator, outputs, inputs, adj, final);
     else
   #endif
       cpu::GRUFastBackward(allocator, outputs, inputs, adj, final);
   }

   // clang-format off
   DISPATCH4(Att, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
   DISPATCH7(AttBack, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
   // clang-format on

   #ifdef CUDA_FOUND
   namespace gpu {
   float L2Norm(marian::Tensor in, Ptr<Allocator> allocator);
   }
   #endif

   namespace cpu {
   float L2Norm(marian::Tensor in, Ptr<Allocator> allocator);
   }

   static inline float L2Norm(marian::Tensor in, Ptr<Allocator> allocator) {
   #ifdef CUDA_FOUND
     if(in->getBackend()->getDeviceId().type == DeviceType::gpu)
       return gpu::L2Norm(in, allocator);
     else
   #endif
       return cpu::L2Norm(in, allocator);
   }

   // clang-format off
   DISPATCH5(PoolingWithMaskingForward, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
   DISPATCH6(PoolingWithMaskingBackward, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, int, bool)
   // clang-format on

   }  // namespace marian
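
A minimal usage sketch of the dispatching wrappers declared above. It assumes a ``Tensor`` and an ``Allocator`` already created by the surrounding Marian code; ``dropoutAndNorm`` is a hypothetical helper, not part of the library:

.. code-block:: cpp

   #include "tensors/tensor_operators.h"

   using namespace marian;

   // Apply inverted dropout to a tensor in place, then return its L2 norm.
   // Both calls forward to the GPU or CPU implementation depending on the
   // tensor's backend.
   float dropoutAndNorm(Tensor grad, Ptr<Allocator> allocator, float dropProb) {
     Dropout(grad, dropProb);         // Bernoulli keep-mask, scaled by 1/(1 - dropProb)
     return L2Norm(grad, allocator);  // device-dispatched reduction
   }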