.. _program_listing_file_src_tensors_gpu_algorithm.cu:

Program Listing for File algorithm.cu
=====================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_tensors_gpu_algorithm.cu>` (``src/tensors/gpu/algorithm.cu``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   #include "tensors/gpu/algorithm.h"

   // clang-format off
   #include "tensors/tensor_operators.h"
   #include "tensors/gpu/cuda_helpers.h"
   // clang-format on

   namespace marian {
   namespace gpu {

   template <typename T>
   void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest) {
     CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
     CudaCopy(begin, end, dest);
     CUDA_CHECK(cudaStreamSynchronize(0));
   }

   // clang-format off
   template void copy<int8_t>(Ptr<Backend>, const int8_t*, const int8_t*, int8_t*);
   template void copy<int16_t>(Ptr<Backend>, const int16_t*, const int16_t*, int16_t*);
   template void copy<int32_t>(Ptr<Backend>, const int32_t*, const int32_t*, int32_t*);
   template void copy<int64_t>(Ptr<Backend>, const int64_t*, const int64_t*, int64_t*);

   template void copy<uint8_t>(Ptr<Backend>, const uint8_t*, const uint8_t*, uint8_t*);
   template void copy<uint16_t>(Ptr<Backend>, const uint16_t*, const uint16_t*, uint16_t*);
   template void copy<uint32_t>(Ptr<Backend>, const uint32_t*, const uint32_t*, uint32_t*);
   template void copy<uint64_t>(Ptr<Backend>, const uint64_t*, const uint64_t*, uint64_t*);

   template void copy<char>(Ptr<Backend>, const char*, const char*, char*);
   template void copy<float16>(Ptr<Backend>, const float16*, const float16*, float16*);
   template void copy<float>(Ptr<Backend>, const float*, const float*, float*);
   template void copy<double>(Ptr<Backend>, const double*, const double*, double*);
   // clang-format on

   template <typename T>
   __global__ void gFill(T* d_in, int size, T val) {
     //auto blocks = gridDim.x;
     auto threadsPerBlock = blockDim.x;
     //for(int bid = 0; bid < size; bid += threadsPerBlock * blocks) {
       int index = /*bid +*/ threadIdx.x + threadsPerBlock * blockIdx.x;
       if(index < size) {
         d_in[index] = val;
       }
     //}
   }

   template <typename T>
   void fill(Ptr<Backend> backend, T* begin, T* end, T value) {
     int size = end - begin;
     if(size == 0)
       return;
     CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
     int threadsPerBlock = std::min(MAX_THREADS, size);
     int blocks = (size / threadsPerBlock) + (size % threadsPerBlock != 0); // @TODO: (size+threadsPerBlock-1)/threadsPerBlock or CeilDiv(a,b)
     gFill<<<blocks, threadsPerBlock>>>(begin, size, value);
     CUDA_CHECK(cudaStreamSynchronize(0));
   }

   template <>
   void fill<float16>(Ptr<Backend> backend, float16* begin, float16* end, float16 value) {
     int size = end - begin;
     if(size == 0)
       return;
   #if COMPILE_FP16
     CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
     int threadsPerBlock = std::min(MAX_THREADS, size);
     int blocks = (size / threadsPerBlock) + (size % threadsPerBlock != 0); // @TODO: (size+threadsPerBlock-1)/threadsPerBlock or CeilDiv(a,b)
     gFill<<<blocks, threadsPerBlock>>>((__half*)begin, size, (__half)value);
     CUDA_CHECK(cudaStreamSynchronize(0));
   #else
     ABORT("FP16 not supported with current hardware or CUDA version");
   #endif
   }

   template void fill<bool>(Ptr<Backend>, bool*, bool*, bool);
   template void fill<int8_t>(Ptr<Backend>, int8_t*, int8_t*, int8_t);
   template void fill<int16_t>(Ptr<Backend>, int16_t*, int16_t*, int16_t);
   template void fill<int32_t>(Ptr<Backend>, int32_t*, int32_t*, int32_t);
   template void fill<int64_t>(Ptr<Backend>, int64_t*, int64_t*, int64_t);

   template void fill<uint8_t>(Ptr<Backend>, uint8_t*, uint8_t*, uint8_t);
   template void fill<uint16_t>(Ptr<Backend>, uint16_t*, uint16_t*, uint16_t);
   template void fill<uint32_t>(Ptr<Backend>, uint32_t*, uint32_t*, uint32_t);
   template void fill<uint64_t>(Ptr<Backend>, uint64_t*, uint64_t*, uint64_t);

   template void fill<float>(Ptr<Backend>, float*, float*, float);
   template void fill<double>(Ptr<Backend>, double*, double*, double);

   void setSparse(Ptr<Backend> backend,
                  const std::vector<size_t>& keys,
                  const std::vector<float>& values,
                  float* data) {
     CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
     ABORT("no SetSparse");
     // gpu::SetSparse(data, keys, values);
     CUDA_CHECK(cudaStreamSynchronize(0));
   }

   template <typename T>
   __global__ void gSwap(T* d_v1, T* d_v2, int size) {
     auto threadsPerBlock = blockDim.x;
     int index = threadIdx.x + threadsPerBlock * blockIdx.x;
     if(index < size) {
       T temp = d_v1[index];
       d_v1[index] = d_v2[index];
       d_v2[index] = temp;
     }
   }

   template <typename T>
   void swap_ranges(Ptr<Backend> backend, T* begin, T* end, T* dest) {
     int size = end - begin;
     if(size == 0)
       return;
     CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
     int threadsPerBlock = std::min(MAX_THREADS, size);
     int blocks = (size / threadsPerBlock) + (size % threadsPerBlock != 0); // @TODO: (size+threadsPerBlock-1)/threadsPerBlock or CeilDiv(a,b)
     gSwap<<<blocks, threadsPerBlock>>>(begin, dest, size);
     CUDA_CHECK(cudaStreamSynchronize(0));
   }

   template <>
   void swap_ranges<float16>(Ptr<Backend> backend, float16* begin, float16* end, float16* dest) {
     int size = end - begin;
     if(size == 0)
       return;
   #if COMPILE_FP16
     CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
     int threadsPerBlock = std::min(MAX_THREADS, size);
     int blocks = (size / threadsPerBlock) + (size % threadsPerBlock != 0); // @TODO: (size+threadsPerBlock-1)/threadsPerBlock or CeilDiv(a,b)
     gSwap<<<blocks, threadsPerBlock>>>((__half*)begin, (__half*)dest, size);
     CUDA_CHECK(cudaStreamSynchronize(0));
   #else
     ABORT("FP16 not supported with current hardware or CUDA version");
   #endif
   }

   // clang-format off
   template void swap_ranges<char>(Ptr<Backend>, char*, char*, char*);
   template void swap_ranges<int8_t>(Ptr<Backend>, int8_t*, int8_t*, int8_t*);
   template void swap_ranges<int16_t>(Ptr<Backend>, int16_t*, int16_t*, int16_t*);
   template void swap_ranges<int32_t>(Ptr<Backend>, int32_t*, int32_t*, int32_t*);
   template void swap_ranges<int64_t>(Ptr<Backend>, int64_t*, int64_t*, int64_t*);

   template void swap_ranges<uint8_t>(Ptr<Backend>, uint8_t*, uint8_t*, uint8_t*);
   template void swap_ranges<uint16_t>(Ptr<Backend>, uint16_t*, uint16_t*, uint16_t*);
   template void swap_ranges<uint32_t>(Ptr<Backend>, uint32_t*, uint32_t*, uint32_t*);
   template void swap_ranges<uint64_t>(Ptr<Backend>, uint64_t*, uint64_t*, uint64_t*);

   template void swap_ranges<float>(Ptr<Backend>, float*, float*, float*);
   template void swap_ranges<double>(Ptr<Backend>, double*, double*, double*);
   // clang-format on

   }  // namespace gpu
   }  // namespace marian
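
The helpers above launch one thread per element and derive the block count as ``(size / threadsPerBlock) + (size % threadsPerBlock != 0)``, which, as the ``@TODO`` notes, is just ceiling division. The following is a minimal, self-contained sketch of the same launch-geometry pattern in plain CUDA, independent of Marian's ``Ptr<Backend>``/``CUDA_CHECK`` wrappers; the names ``fillKernel``, ``maxThreads``, and the chosen sizes are illustrative, not part of the listing.

.. code-block:: cpp

   #include <algorithm>
   #include <cstdio>
   #include <cuda_runtime.h>

   // Same element-wise pattern as gFill above: one thread writes one element.
   template <typename T>
   __global__ void fillKernel(T* data, int size, T value) {
     int index = threadIdx.x + blockDim.x * blockIdx.x;
     if(index < size)
       data[index] = value;
   }

   int main() {
     const int size = 1000;
     const int maxThreads = 512;   // stand-in for MAX_THREADS from cuda_helpers.h
     float* d_data = nullptr;
     cudaMalloc(&d_data, size * sizeof(float));

     int threadsPerBlock = std::min(maxThreads, size);
     // Ceiling division; equivalent to the listing's
     // (size / threadsPerBlock) + (size % threadsPerBlock != 0).
     int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;

     fillKernel<<<blocks, threadsPerBlock>>>(d_data, size, 3.14f);
     cudaDeviceSynchronize();      // the listing synchronizes the default stream instead

     float host[2];
     cudaMemcpy(host, d_data, sizeof(host), cudaMemcpyDeviceToHost);
     std::printf("%.2f %.2f\n", host[0], host[1]);
     cudaFree(d_data);
     return 0;
   }

With ``size = 1000`` and 512 threads per block this launches 2 blocks (1024 threads), and the ``index < size`` guard keeps the 24 surplus threads from writing out of bounds, which is why both the kernel here and ``gFill``/``gSwap`` in the listing carry that check.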