.. _program_listing_file_src_tensors_cpu_prod.cpp:

Program Listing for File prod.cpp
=================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_tensors_cpu_prod.cpp>` (``src/tensors/cpu/prod.cpp``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   /* All or part of this file was contributed by Intel under license:
    *   Copyright (C) 2017-2018 Intel Corporation
    *   SPDX-License-Identifier: MIT
    */

   #include "tensors/cpu/backend.h"
   #include "tensors/tensor.h"
   #include "tensors/tensor_allocator.h"

   #if MKL_FOUND
   #include <mkl.h>
   #else
   #if BLAS_FOUND
   #include <cblas.h>
   #endif
   #endif

   #include "integer_common.h"
   #include "prod_blas.h"

   namespace marian {

   namespace cpu {

   void Prod(marian::Tensor C,
             const marian::Tensor& A,
             const marian::Tensor& B,
             bool transA,
             bool transB,
             float beta,
             float scalar) {
   #if BLAS_FOUND
     float alpha = scalar;

     int m = A->shape().elements() / A->shape()[-1];
     int k = A->shape().back();
     if(transA)
       std::swap(m, k);

     int l = B->shape().elements() / B->shape()[-1];
     int n = B->shape()[-1];
     if(transB)
       std::swap(l, n);

     int lda = A->shape()[-1];
     int ldb = B->shape()[-1];
     int ldc = B->shape()[-1];

     if(transB)
       ldc = B->shape().elements() / B->shape()[-1];

     sgemm(transA,
           transB,
           m,
           n,
           k,
           alpha,
           A->data(),
           lda,
           B->data(),
           ldb,
           beta,
           C->data(),
           ldc);
   #else
     C; A; B; transA; transB; beta; scalar;
     ABORT("You need to compile with MKL in order to use the CPU version");
   #endif
   }

   // dummy implementation, computeType doesn't do anything on CPU
   void Prod(marian::Tensor C,
             const marian::Tensor& A,
             const marian::Tensor& B,
             bool transA,
             bool transB,
             float beta,
             float scalar,
             Type computeType) {
     computeType; // make compiler happy
     cpu::Prod(C, A, B, transA, transB, beta, scalar);
   }

   void ProdBatched(marian::Tensor C,
                    Ptr<Allocator> /*allocator*/,
                    const marian::Tensor A,
                    const marian::Tensor B,
                    bool transA,
                    bool transB,
                    float beta,
                    float scalar) {
   #if BLAS_FOUND
     float alpha = scalar;

     // determine meta-shape of bdot operation. Essentially treat the last two dimensions as single elements
     // such that (..., m, k) x (..., k, n) -> (..., m, n) where ... is a broadcastable shape as in element-wise kernels.

     auto aShape = A->shape();
     auto bShape = B->shape();

     // make sure both shapes have the same number of dimensions via broadcasting
     size_t maxLength = std::max(aShape.size(), bShape.size());
     if(aShape.size() != bShape.size()) {
       Shape ones(std::vector<int>(maxLength, 1));
       aShape = Shape::broadcast({aShape, ones});
       bShape = Shape::broadcast({bShape, ones});
     }

     // Create meta-shapes without last 2 dimensions
     Shape aShapeMeta, bShapeMeta, cShapeMeta;
     aShapeMeta.resize(maxLength - 2);
     bShapeMeta.resize(maxLength - 2);
     for(size_t i = 0; i < maxLength - 2; ++i) {
       aShapeMeta.set(i, aShape[i]);
       bShapeMeta.set(i, bShape[i]);
     }
     cShapeMeta = Shape::broadcast({aShapeMeta, bShapeMeta});
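
     // Example: meta-shapes (2, 1) for A and (2, 6) for B broadcast to a
     // cShapeMeta of (2, 6), i.e. 12 output matrices; the bindex() calls below
     // map each output batch index back to the corresponding (possibly repeated)
     // A and B sub-matrices.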

     int m = aShape[-2];
     int k = aShape[-1];
     if(transA)
       std::swap(m, k);

     int l = bShape[-2];
     int n = bShape[-1];
     if(transB)
       std::swap(l, n);

     int lda = aShape[-1];
     int ldb = bShape[-1];
     int ldc = bShape[-1];

     if(transB)
       ldc = bShape[-2];

     int strideA = m * k;
     int strideB = n * k;
     int strideC = n * m;

     int batchC = cShapeMeta.elements();

     // Convert to functional shapes to be able to map dimensions. @TODO merge this
     functional::Shape aShapeMetaF = aShapeMeta;
     functional::Shape bShapeMetaF = bShapeMeta;
     functional::Shape cShapeMetaF = cShapeMeta;

   #if MKL_FOUND
     CBLAS_TRANSPOSE transA_forarr = CblasNoTrans;
     CBLAS_TRANSPOSE transB_forarr = CblasNoTrans;

     if(transA)
       transA_forarr = CblasTrans;

     if(transB)
       transB_forarr = CblasTrans;

     /* cblas_sgemm_batch allows us to group all the small GEMMs that are done in a for loop with sgemm and compute
      * them in only one MKL call. For the API documentation refer to
      * https://software.intel.com/content/www/us/en/develop/documentation/mkl-developer-reference-c/top/blas-and-sparse-blas-routines/blas-like-extensions/cblas-gemm-batch.html
      * The API supports dependencies, where you can specify one "group" of GEMMs to be computed after another.
      * (This is controlled by the group_count parameter.)
      * In our case, the operations are not dependent on one another, so we hardcode one group. The rest of the arguments
      * (with the exception of group_size) are the same as the ones that cblas_sgemm expects, with the difference that
      * we are supposed to provide an array pointer (one element per group).
      * Weirdly enough, we are required to provide all of the integer arguments as the MKL_INT datatype.
      */

     static const constexpr size_t group_count = 1; // We have one group
     const std::vector<CBLAS_TRANSPOSE> transa_arr(group_count, transA_forarr);
     const std::vector<CBLAS_TRANSPOSE> transb_arr(group_count, transB_forarr);
     const std::vector<MKL_INT> m_arr(group_count, (MKL_INT)m);
     const std::vector<MKL_INT> n_arr(group_count, (MKL_INT)n);
     const std::vector<MKL_INT> k_arr(group_count, (MKL_INT)k);
     const std::vector<float> alpha_arr(group_count, alpha);
     const std::vector<float> beta_arr(group_count, beta);
     const std::vector<MKL_INT> lda_arr(group_count, (MKL_INT)lda);
     const std::vector<MKL_INT> ldb_arr(group_count, (MKL_INT)ldb);
     const std::vector<MKL_INT> ldc_arr(group_count, (MKL_INT)ldc);
     const std::vector<MKL_INT> group_size(group_count, (MKL_INT)batchC); // Group size specifies number of GEMM operations per group (which is batchC)

     std::vector<const float*> a_array(batchC, nullptr);
     std::vector<const float*> b_array(batchC, nullptr);
     std::vector<float*> c_array(batchC, nullptr);

     // This loop initializes the array pointers in the same way as the for loop
     // in the normal sgemm version a few lines below
     functional::Array<int, functional::Shape::size()> dims;
     for(int i = 0; i < batchC; ++i) {
       cShapeMetaF.dims(i, dims);
       auto aIndex = aShapeMetaF.bindex(dims);
       auto bIndex = bShapeMetaF.bindex(dims);

       a_array[i] = A->data() + aIndex * strideA;
       b_array[i] = B->data() + bIndex * strideB;
       c_array[i] = C->data() + i * strideC;
     }
     cblas_sgemm_batch(CblasRowMajor,
                       &transa_arr[0],
                       &transb_arr[0],
                       &m_arr[0],
                       &n_arr[0],
                       &k_arr[0],
                       &alpha_arr[0],
                       &a_array[0],
                       &lda_arr[0],
                       &b_array[0],
                       &ldb_arr[0],
                       &beta_arr[0],
                       &c_array[0],
                       &ldc_arr[0],
                       group_count,
                       &group_size[0]);
   #else
     functional::Array<int, functional::Shape::size()> dims;
     for(size_t i = 0; i < batchC; ++i) {
       cShapeMetaF.dims(i, dims);
       auto aIndex = aShapeMetaF.bindex(dims);
       auto bIndex = bShapeMetaF.bindex(dims);

       sgemm(transA,
             transB,
             m,
             n,
             k,
             alpha,
             A->data() + aIndex * strideA,
             lda,
             B->data() + bIndex * strideB,
             ldb,
             beta,
             C->data() + i * strideC,
             ldc);
     }
   #endif
   #else
     C; A; B; transA; transB; beta; scalar;
     ABORT("You need to compile with MKL in order to use the CPU version");
   #endif
   }

   void ProdBatchedLegacy(marian::Tensor C,
                          Ptr<Allocator> /*allocator*/,
                          const marian::Tensor A,
                          const marian::Tensor B,
                          bool transA,
                          bool transB,
                          float beta,
                          float scalar) {
   #if BLAS_FOUND
     float alpha = scalar;

     size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]);
     size_t batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]);
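
     // Batch handling in this legacy path: the loops below address A and B with
     // (i % batchA) and (i % batchB); an operand with a single batch entry gets a
     // stride of 0, so its one matrix is reused for every output batch.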

     size_t m = A->shape()[-2];
     size_t k = A->shape()[-1];
     if(transA)
       std::swap(m, k);

     size_t l = B->shape()[-2];
     size_t n = B->shape()[-1];
     if(transB)
       std::swap(l, n);

     size_t lda = A->shape()[-1];
     size_t ldb = B->shape()[-1];
     size_t ldc = B->shape()[-1];

     if(transB)
       ldc = B->shape()[-2];

     auto strideB = batchB == 1 ? 0 : n * k;
     auto strideA = batchA == 1 ? 0 : m * k;
     auto strideC = n * m;

     auto batchC = std::max(batchA, batchB);
   #if MKL_FOUND
     CBLAS_TRANSPOSE transA_forarr = CblasNoTrans;
     CBLAS_TRANSPOSE transB_forarr = CblasNoTrans;

     if(transA)
       transA_forarr = CblasTrans;

     if(transB)
       transB_forarr = CblasTrans;

     /* cblas_sgemm_batch allows us to group all the small GEMMs that are done in a for loop with sgemm and compute
      * them in only one MKL call. For the API documentation refer to
      * https://software.intel.com/content/www/us/en/develop/documentation/mkl-developer-reference-c/top/blas-and-sparse-blas-routines/blas-like-extensions/cblas-gemm-batch.html
      * The API supports dependencies, where you can specify one "group" of GEMMs to be computed after another.
      * (This is controlled by the group_count parameter.)
      * In our case, the operations are not dependent on one another, so we hardcode one group. The rest of the arguments
      * (with the exception of group_size) are the same as the ones that cblas_sgemm expects, with the difference that
      * we are supposed to provide an array pointer (one element per group).
      * Weirdly enough, we are required to provide all of the integer arguments as the MKL_INT datatype.
      */

     static const constexpr size_t group_count = 1; // We have one group
     const std::vector<CBLAS_TRANSPOSE> transa_arr(group_count, transA_forarr);
     const std::vector<CBLAS_TRANSPOSE> transb_arr(group_count, transB_forarr);
     const std::vector<MKL_INT> m_arr(group_count, (MKL_INT)m);
     const std::vector<MKL_INT> n_arr(group_count, (MKL_INT)n);
     const std::vector<MKL_INT> k_arr(group_count, (MKL_INT)k);
     const std::vector<float> alpha_arr(group_count, alpha);
     const std::vector<float> beta_arr(group_count, beta);
     const std::vector<MKL_INT> lda_arr(group_count, (MKL_INT)lda);
     const std::vector<MKL_INT> ldb_arr(group_count, (MKL_INT)ldb);
     const std::vector<MKL_INT> ldc_arr(group_count, (MKL_INT)ldc);
     const std::vector<MKL_INT> group_size(group_count, (MKL_INT)batchC); // Group size specifies number of GEMM operations per group (which is batchC)

     std::vector<const float*> a_array(batchC, nullptr);
     std::vector<const float*> b_array(batchC, nullptr);
     std::vector<float*> c_array(batchC, nullptr);

     // This loop initializes the array pointers in the same way as the for loop
     // in the normal sgemm version a few lines below
     for(size_t i = 0; i < batchC; ++i) {
       a_array[i] = A->data() + (i % batchA) * strideA;
       b_array[i] = B->data() + (i % batchB) * strideB;
       c_array[i] = C->data() + i * strideC;
     }
     cblas_sgemm_batch(CblasRowMajor,
                       &transa_arr[0],
                       &transb_arr[0],
                       &m_arr[0],
                       &n_arr[0],
                       &k_arr[0],
                       &alpha_arr[0],
                       &a_array[0],
                       &lda_arr[0],
                       &b_array[0],
                       &ldb_arr[0],
                       &beta_arr[0],
                       &c_array[0],
                       &ldc_arr[0],
                       group_count,
                       &group_size[0]);
   #else
     for(size_t i = 0; i < batchC; ++i) {
       sgemm(transA,
             transB,
             (int)m,
             (int)n,
             (int)k,
             alpha,
             A->data() + (i % batchA) * strideA,
             (int)lda,
             B->data() + (i % batchB) * strideB,
             (int)ldb,
             beta,
             C->data() + i * strideC,
             (int)ldc);
     }
   #endif
   #else
     C; A; B; transA; transB; beta; scalar;
     ABORT("You need to compile with MKL in order to use the CPU version");
   #endif
   }

   void ProdWithBias(marian::Tensor C,
                     const marian::Tensor& A,
                     const marian::Tensor& B,
                     const marian::Tensor& bias,
                     bool transA,
                     bool transB,
                     float beta,
                     float scalar) {
     cpu::Prod(C, A, B, transA, transB, beta, scalar);
     cpu::integer::AddBias(C, bias);
   }

   void Affine(marian::Tensor C,
               Ptr<Allocator> /*allocator*/,
               const marian::Tensor& A,
               const marian::Tensor& B,
               const marian::Tensor& bias,
               bool transA,
               bool transB,
               float beta,
               float scalar,
               bool reluPostprocess) {
     using namespace functional;
     ProdWithBias(C, A, B, bias, transA, transB, beta, scalar);
     if(reluPostprocess)
       cpu::Element(_1 = ReLU(_1), C); // @TODO: also fuse with AddBias
   }

   void CSRProd(marian::Tensor C,
                Ptr<Allocator> /*allocator*/,
                const marian::Tensor& S_values,
                const marian::Tensor& S_indices,
                const marian::Tensor& S_offsets,
                const marian::Tensor& D,
                bool transS,
                bool swapOperands,
                float beta) {
     C, S_values, S_indices, S_offsets, D;

     // Note: The CPU implementation currently only implements what's needed for decoding.

     // interpret tensor dimensions as matrix dimensions
     const auto& shapeC = C->shape();
     const auto& shapeD = D->shape();

     // If swapOperands, S and D are swapped (C = D x S instead of C = S x D).
     // In that case, in the next 6 lines, please read all dimensions as if they were reversed in order.
     auto rowsC = shapeC[-(int)swapOperands];
     auto colsC = shapeC.elements() / rowsC;
     auto rowsD = shapeD[-(int)swapOperands];
     auto colsD = shapeD.elements() / rowsD;
     auto rowsS = transS ? rowsD : rowsC;
     auto colsS = transS ? rowsC : rowsD;
     ABORT_IF(colsD != colsC, "Inconsistent outer dimensions in CSR product");

     if (swapOperands) { // make rowsX actual row dimensions again, likewise colsX
       std::swap(rowsC, colsC);
       std::swap(rowsD, colsD);
       std::swap(rowsS, colsS);
     }

     // sparse arrays
     auto numOffsets = S_offsets->shape().elements() - 1; // -1 since last value is length
     ABORT_IF(numOffsets != rowsS, "Unexpected number of rows in CSR argument");
     numOffsets;
     ABORT_IF(S_values->shape() != S_indices->shape(), "CSR values and indices must have the same size");

     if (!transS && !swapOperands) {
       // C = S * D, where S is the CSR (sparse) matrix
       const auto* offsets = S_offsets->data<IndexType>();
       const auto* indices = S_indices->data<IndexType>();
       const auto* values  = S_values->data();
       const auto* dataD   = D->data();
       auto*       dataC   = C->data();
       ABORT_IF(beta != 0 && beta != 1, "cpu::CSRProd only supports beta = 0 or 1");
       for (size_t i = 0; i < rowsC; i++) {
         auto add = (beta == 1); // first element: overwrite or add according to beta; subsequent elements: add
         for (size_t kk = offsets[i]; kk < offsets[i + 1]; kk++) {
           auto k    = indices[kk]; // fetch the non-zero row
           auto valS = values[kk];  // and the value from that row
           // This code is written with the hope for good vectorization, and the hope
           // that adding to memory will be done efficiently by the caching system.
           if (valS == 1)
             if (!add)
               for (size_t j = 0; j < colsC; j++)
                 dataC[i * colsC/*==colsD*/ + j] = dataD[k * colsC/*==colsD*/ + j]; // this is a memcpy()
             else
               for (size_t j = 0; j < colsC; j++)
                 dataC[i * colsC/*==colsD*/ + j] += dataD[k * colsC/*==colsD*/ + j]; // this is a contiguous-vector addition
           else
             if (!add)
               for (size_t j = 0; j < colsC; j++)
                 dataC[i * colsC/*==colsD*/ + j] = valS * dataD[k * colsC/*==colsD*/ + j];
             else
               for (size_t j = 0; j < colsC; j++)
                 dataC[i * colsC/*==colsD*/ + j] += valS * dataD[k * colsC/*==colsD*/ + j]; // notice the +=
           add = true; // next iteration will add to existing result
         }
       }
     }
     else
       ABORT("CSRProd for transS={}, swapOperands={} is not yet implemented for CPU", transS, swapOperands);
   }

   }  // namespace cpu
   }  // namespace marian
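
The GEMM calls above go through the ``sgemm`` helper declared in ``prod_blas.h``, which is not part of this listing. As a rough orientation only, a row-major wrapper with the same call signature could be sketched as follows; this is an illustrative assumption, not the actual contents of ``prod_blas.h``:

.. code-block:: cpp

   #include <cblas.h>

   // Hypothetical sketch of a row-major sgemm wrapper matching the calls in
   // prod.cpp: C(m x n) = alpha * op(A)(m x k) * op(B)(k x n) + beta * C.
   // The real helper lives in prod_blas.h and may differ in details.
   inline void sgemm(bool transA, bool transB,
                     int m, int n, int k,
                     float alpha,
                     const float* A, int lda,
                     const float* B, int ldb,
                     float beta,
                     float* C, int ldc) {
     cblas_sgemm(CblasRowMajor,
                 transA ? CblasTrans : CblasNoTrans,
                 transB ? CblasTrans : CblasNoTrans,
                 m, n, k,
                 alpha, A, lda,
                 B, ldb,
                 beta, C, ldc);
   }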