.. _program_listing_file_src_tensors_gpu_prod_sparse_cu11.h: Program Listing for File prod_sparse_cu11.h =========================================== |exhale_lsh| :ref:`Return to documentation for file ` (``src/tensors/gpu/prod_sparse_cu11.h``) .. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS .. code-block:: cpp #ifdef _MSC_VER #pragma warning(disable: 4505) // warning C4505: '__float2half_rz': unreferenced local function has been removed (missing 'static inline') #endif #include #include // clang-format off #include "tensors/gpu/prod.h" #include "tensors/gpu/backend.h" #include "tensors/gpu/cuda_helpers.h" // clang-format on namespace marian { namespace gpu { // primary template for specialization with different element and compute types template struct TypedSparseGemm { static cudaDataType getCudaDataType(const float*) { return CUDA_R_32F; }; static cudaDataType getCudaDataType(const half*) { return CUDA_R_16F; }; #if 0 static void CSRProdSwapped(marian::Tensor C, Ptr allocator, const marian::Tensor& S_values, const marian::Tensor& S_indices, const marian::Tensor& S_offsets, const marian::Tensor& D, bool transS, ElementType beta) { cudaSetDevice((int)C->getDeviceId().no); auto cusparseHandle = std::static_pointer_cast(C->getBackend())->getCusparseHandle(); // interpret tensor dimensions as matrix dimensions const auto& shapeC = C->shape(); const auto& shapeD = D->shape(); auto colsC = shapeC[-1]; auto rowsC = shapeC.elements() / colsC; auto colsD = shapeD[-1]; auto rowsD = shapeD.elements() / colsD; auto rowsS = rowsC; auto colsS = rowsD; auto denseOrder = CUSPARSE_ORDER_COL; auto algorithm = CUSPARSE_SPMM_ALG_DEFAULT; std::cerr << shapeC << std::endl; std::cerr << shapeD << std::endl; if(transS) std::swap(rowsS, colsS); // sparse arrays auto numValues = S_values->shape().elements(); auto numOffsets = S_offsets->shape().elements() - 1; // -1 since last value is length ABORT_IF(numOffsets != rowsS, "Unexpected number of rows in CSR argument"); ABORT_IF(S_values->shape() != S_indices->shape(), "CSR values and indices must have the same size"); ElementType alpha = 1.0; cusparseSpMatDescr_t descS; cusparseDnMatDescr_t descD; cusparseDnMatDescr_t descC; CUSPARSE_CHECK(cusparseCreateCsr(&descS, rowsS, colsS, numValues, S_offsets->data(), S_indices->data(), S_values ->data(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, getCudaDataType(S_values->data()))); CUSPARSE_CHECK(cusparseCreateDnMat(&descD, rowsD, colsD, /*ld=*/colsD, D->data(), getCudaDataType(D->data()), denseOrder)); CUSPARSE_CHECK(cusparseCreateDnMat(&descC, rowsC, colsC, /*ld=*/colsC, C->data(), getCudaDataType(C->data()), denseOrder)); size_t bufferSize = 0; CUSPARSE_CHECK(cusparseSpMM_bufferSize(cusparseHandle, transS ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descS, descD, &beta, descC, getCudaDataType(C->data()), algorithm, &bufferSize)); if(bufferSize > 0) { MemoryPiece::PtrType buffer = allocator->alloc(bufferSize); CUSPARSE_CHECK(cusparseSpMM(cusparseHandle, transS ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descS, descD, &beta, descC, getCudaDataType(C->data()), algorithm, buffer->data())); allocator->free(buffer); } CUSPARSE_CHECK(cusparseDestroySpMat(descS)); CUSPARSE_CHECK(cusparseDestroyDnMat(descD)); CUSPARSE_CHECK(cusparseDestroyDnMat(descC)); } #endif // C = op(S) x D if not swapOperands else C = D x op(S) // op(S) = S if not transA else S^T static void CSRProd(marian::Tensor C, Ptr allocator, const marian::Tensor& S_values, const marian::Tensor& S_indices, const marian::Tensor& S_offsets, const marian::Tensor& D, bool transS, ElementType beta) { cudaSetDevice((int)C->getDeviceId().no); auto cusparseHandle = std::static_pointer_cast(C->getBackend())->getCusparseHandle(); // interpret tensor dimensions as matrix dimensions const auto& shapeC = C->shape(); const auto& shapeD = D->shape(); auto colsC = shapeC[-1]; auto rowsC = shapeC.elements() / colsC; auto colsD = shapeD[-1]; auto rowsD = shapeD.elements() / colsD; auto rowsS = rowsC; auto colsS = rowsD; auto denseOrder = CUSPARSE_ORDER_ROW; auto algorithm = CUSPARSE_SPMM_CSR_ALG2; if(transS) std::swap(rowsS, colsS); // sparse arrays auto numValues = S_values->shape().elements(); auto numOffsets = S_offsets->shape().elements() - 1; // -1 since last value is length ABORT_IF(numOffsets != rowsS, "Unexpected number of rows in CSR argument"); ABORT_IF(S_values->shape() != S_indices->shape(), "CSR values and indices must have the same size"); ElementType alpha = 1.0; cusparseSpMatDescr_t descS; cusparseDnMatDescr_t descD; cusparseDnMatDescr_t descC; CUSPARSE_CHECK(cusparseCreateCsr(&descS, rowsS, colsS, numValues, S_offsets->data(), S_indices->data(), S_values ->data(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, getCudaDataType(S_values->data()))); CUSPARSE_CHECK(cusparseCreateDnMat(&descD, rowsD, colsD, /*ld=*/colsD, D->data(), getCudaDataType(D->data()), denseOrder)); CUSPARSE_CHECK(cusparseCreateDnMat(&descC, rowsC, colsC, /*ld=*/colsC, C->data(), getCudaDataType(C->data()), denseOrder)); size_t bufferSize = 0; CUSPARSE_CHECK(cusparseSpMM_bufferSize(cusparseHandle, transS ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descS, descD, &beta, descC, getCudaDataType(C->data()), algorithm, &bufferSize)); if(bufferSize > 0) { MemoryPiece::PtrType buffer = allocator->alloc(bufferSize); CUSPARSE_CHECK(cusparseSpMM(cusparseHandle, transS ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descS, descD, &beta, descC, getCudaDataType(C->data()), algorithm, buffer->data())); allocator->free(buffer); } CUSPARSE_CHECK(cusparseDestroySpMat(descS)); CUSPARSE_CHECK(cusparseDestroyDnMat(descD)); CUSPARSE_CHECK(cusparseDestroyDnMat(descC)); } static void CSRProd(marian::Tensor C, Ptr allocator, const marian::Tensor& S_values, const marian::Tensor& S_indices, const marian::Tensor& S_offsets, const marian::Tensor& D, bool transS, bool swapOperands, ElementType beta) { if(swapOperands) { ABORT("Not implemented"); // CSRProdSwapped(C, allocator, S_values, S_indices, S_offsets, D, transS, beta); } else { CSRProd(C, allocator, S_values, S_indices, S_offsets, D, transS, beta); } } }; } // namespace gpu } // namespace marian