Program Listing for File algorithm.cu

Return to documentation for file (src/tensors/gpu/algorithm.cu)

#include "tensors/gpu/algorithm.h"

// clang-format off
#include "tensors/tensor_operators.h"
#include "tensors/gpu/cuda_helpers.h"
// clang-format on

namespace marian {
namespace gpu {

template <typename T>
void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest) {
  CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
  CudaCopy(begin, end, dest);
  CUDA_CHECK(cudaStreamSynchronize(0));
}

// clang-format off
template void copy<int8_t>(Ptr<Backend>, const int8_t*, const int8_t*, int8_t*);
template void copy<int16_t>(Ptr<Backend>, const int16_t*, const int16_t*, int16_t*);
template void copy<int32_t>(Ptr<Backend>, const int32_t*, const int32_t*, int32_t*);
template void copy<int64_t>(Ptr<Backend>, const int64_t*, const int64_t*, int64_t*);
template void copy<uint8_t>(Ptr<Backend>, const uint8_t*, const uint8_t*, uint8_t*);
template void copy<uint16_t>(Ptr<Backend>, const uint16_t*, const uint16_t*, uint16_t*);
template void copy<uint32_t>(Ptr<Backend>, const uint32_t*, const uint32_t*, uint32_t*);
template void copy<uint64_t>(Ptr<Backend>, const uint64_t*, const uint64_t*, uint64_t*);
template void copy<char>(Ptr<Backend>, const char*, const char*, char*);
template void copy<float16>(Ptr<Backend>, const float16*, const float16*, float16*);
template void copy<float>(Ptr<Backend>, const float*, const float*, float*);
template void copy<double>(Ptr<Backend>, const double*, const double*, double*);
// clang-format on

template <typename T>
__global__ void gFill(T* d_in, int size, T val) {
  //auto blocks = gridDim.x;
  auto threadsPerBlock = blockDim.x;
  //for(int bid = 0; bid < size; bid += threadsPerBlock * blocks) {
    int index = /*bid +*/ threadIdx.x + threadsPerBlock * blockIdx.x;
    if(index < size) {
      d_in[index] = val;
    }
  //}
}

template <typename T>
void fill(Ptr<Backend> backend, T* begin, T* end, T value) {
  int size = end - begin;
  if (size == 0)
    return;
  CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
  int threadsPerBlock = std::min(MAX_THREADS, size);
  int blocks = (size / threadsPerBlock) + (size % threadsPerBlock != 0); // @TODO: (size+threadsPerBlock-1)/threadsPerBlock or CeilDiv(a,b)
  gFill<<<blocks, threadsPerBlock>>>(begin, size, value);
  CUDA_CHECK(cudaStreamSynchronize(0));
}

template <>
void fill<float16>(Ptr<Backend> backend, float16* begin, float16* end, float16 value) {
  int size = end - begin;
  if (size == 0)
    return;
#if COMPILE_FP16
  CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
  int threadsPerBlock = std::min(MAX_THREADS, size);
  int blocks = (size / threadsPerBlock) + (size % threadsPerBlock != 0); // @TODO: (size+threadsPerBlock-1)/threadsPerBlock or CeilDiv(a,b)
  gFill<<<blocks, threadsPerBlock>>>((__half*)begin, size, (__half)value);
  CUDA_CHECK(cudaStreamSynchronize(0));
#else
   ABORT("FP16 not supported with current hardware or CUDA version");
#endif
}

template void fill<bool>(Ptr<Backend>, bool*, bool*, bool);
template void fill<int8_t>(Ptr<Backend>, int8_t*, int8_t*, int8_t);
template void fill<int16_t>(Ptr<Backend>, int16_t*, int16_t*, int16_t);
template void fill<int32_t>(Ptr<Backend>, int32_t*, int32_t*, int32_t);
template void fill<int64_t>(Ptr<Backend>, int64_t*, int64_t*, int64_t);
template void fill<uint8_t>(Ptr<Backend>, uint8_t*, uint8_t*, uint8_t);
template void fill<uint16_t>(Ptr<Backend>, uint16_t*, uint16_t*, uint16_t);
template void fill<uint32_t>(Ptr<Backend>, uint32_t*, uint32_t*, uint32_t);
template void fill<uint64_t>(Ptr<Backend>, uint64_t*, uint64_t*, uint64_t);

template void fill<float>(Ptr<Backend>, float*, float*, float);
template void fill<double>(Ptr<Backend>, double*, double*, double);

void setSparse(Ptr<Backend> backend,
               const std::vector<size_t>& keys,
               const std::vector<float>& values,
               float* data) {
  CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
  ABORT("no SetSparse");
  // gpu::SetSparse(data, keys, values);
  CUDA_CHECK(cudaStreamSynchronize(0));
}

template <typename T>
__global__ void gSwap(T* d_v1, T* d_v2, int size) {
  auto threadsPerBlock = blockDim.x;
  int index = threadIdx.x + threadsPerBlock * blockIdx.x;
  if(index < size) {
    T temp = d_v1[index];
    d_v1[index] = d_v2[index];
    d_v2[index] = temp;
  }
}

template <typename T>
void swap_ranges(Ptr<Backend> backend, T* begin, T* end, T* dest) {
  int size = end - begin;
  if (size == 0)
    return;

  CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
  int threadsPerBlock = std::min(MAX_THREADS, size);
  int blocks = (size / threadsPerBlock) + (size % threadsPerBlock != 0); // @TODO: (size+threadsPerBlock-1)/threadsPerBlock or CeilDiv(a,b)
  gSwap<<<blocks, threadsPerBlock>>>(begin, dest, size);
  CUDA_CHECK(cudaStreamSynchronize(0));
}

template <>
void swap_ranges<float16>(Ptr<Backend> backend, float16* begin, float16* end, float16* dest) {
  int size = end - begin;
  if (size == 0)
    return;

#if COMPILE_FP16
  CUDA_CHECK(cudaSetDevice(backend->getDeviceId().no));
  int threadsPerBlock = std::min(MAX_THREADS, size);
  int blocks = (size / threadsPerBlock) + (size % threadsPerBlock != 0); // @TODO: (size+threadsPerBlock-1)/threadsPerBlock or CeilDiv(a,b)
  gSwap<<<blocks, threadsPerBlock>>>((__half*)begin, (__half*)dest, size);
  CUDA_CHECK(cudaStreamSynchronize(0));
#else
  ABORT("FP16 not supported with current hardware or CUDA version");
#endif
}

// clang-format off
template void swap_ranges<char>(Ptr<Backend>, char*, char*, char*);
template void swap_ranges<int8_t>(Ptr<Backend>, int8_t*, int8_t*, int8_t*);
template void swap_ranges<int16_t>(Ptr<Backend>, int16_t*, int16_t*, int16_t*);
template void swap_ranges<int32_t>(Ptr<Backend>, int32_t*, int32_t*, int32_t*);
template void swap_ranges<int64_t>(Ptr<Backend>, int64_t*, int64_t*, int64_t*);

template void swap_ranges<uint8_t>(Ptr<Backend>, uint8_t*, uint8_t*, uint8_t*);
template void swap_ranges<uint16_t>(Ptr<Backend>, uint16_t*, uint16_t*, uint16_t*);
template void swap_ranges<uint32_t>(Ptr<Backend>, uint32_t*, uint32_t*, uint32_t*);
template void swap_ranges<uint64_t>(Ptr<Backend>, uint64_t*, uint64_t*, uint64_t*);

template void swap_ranges<float>(Ptr<Backend>, float*, float*, float*);
template void swap_ranges<double>(Ptr<Backend>, double*, double*, double*);
// clang-format on

}  // namespace gpu
}  // namespace marian