.. _program_listing_file_src_tensors_cpu_expression_graph_packable.h:

Program Listing for File expression_graph_packable.h
=====================================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_tensors_cpu_expression_graph_packable.h>` (``src/tensors/cpu/expression_graph_packable.h``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   #pragma once

   #include "graph/expression_graph.h"
   #include "fbgemm/packed_gemm.h"
   #include "tensors/cpu/integer_common.h"

   namespace marian {
   namespace cpu {
   void Transpose10(marian::Tensor out, const marian::Tensor in);
   }
   }

   namespace marian {

   // When FBGEMM-based packed GEMM is used, some weight matrices need to be packed offline.
   // The decision about which weights can be packed should be made by walking through the graph.
   // That requires more changes, so for now we decide simply by the name ("_W") of the weights.
   // This also makes the low-level packed_gemm.h APIs interact with the high-level graph class,
   // so we put this immature code into a subclass of ExpressionGraph.
   // We will improve this in the near future.
   class ExpressionGraphPackable : public ExpressionGraph {
   public:
     ExpressionGraphPackable()
       : ExpressionGraph(/* inference = */ true) {} // Packable expression graph only supports inference

     virtual ~ExpressionGraphPackable() {}

     // Convert model weights into packed format and save to IO items.
     std::vector<io::Item> pack(Type gemmElementType = Type::float32, Type saveElementType = Type::float32) {
       std::vector<io::Item> ioItems;

       // handle packable parameters first (a float32 parameter is packable)
       auto packableParameters = paramsByElementType_[Type::float32];
       // sorted by name in std::map
       for (auto p : packableParameters->getMap()) {
         std::string pName = p.first;
         LOG(info, "Processing parameter {} with shape {} and type {}", pName, p.second->shape(), p.second->value_type());

         if (!namespace_.empty()) {
           if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
             pName = pName.substr(namespace_.size() + 2);
         }

         Tensor val = p.second->val();

         // save as packed format
         // @TODO Hardcoded to find packable weights
         // int8 - all the weights used for affine op and dot op
         // fp16 - all the weights used for affine op
         if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
             && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
   #if USE_FBGEMM
           using namespace marian::cpu::variant;
           // packing information - size
           int nrow;
           int ncol;
           uint64_t packsize;

           fbgemmPacked8PackInfo(val->shape(), gemmElementType, pName.find("Wemb") != std::string::npos, nrow, ncol, packsize);

           auto allocator = New<TensorAllocator>(getBackend());

           // buffer tensor to save packed matrix
           Tensor packedTensor;
           allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);

           // Pack B matrix into int8
           fbgemmPacked8Pack(packedTensor, val->data(), gemmElementType, pName.find("Wemb") != std::string::npos, nrow, ncol, packsize);

           io::Item item;
           item.name = pName;
           item.shape = val->shape();
           item.type = gemmElementType;

           // Use the actual memory as this will be aligned and padded.
           // When memory mapping this is required. Shape keeps track of
           // tensor size. Saving to *.npz will cut to size.
           auto mem = packedTensor->memory();
           item.bytes.resize(mem->size());
           copy(backend_, mem->data(), mem->data() + mem->size(), item.bytes.data());

           ioItems.emplace_back(std::move(item));
   #else
           ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
   #endif
         // fp16 quantization option
         } else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) {
   #if USE_FBGEMM
           using namespace marian::cpu::variant;

           // packing information
           int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol;
           uint64_t packsize;

           fbgemmPacked16PackInfo(val->shape(), false, nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol, packsize);

           auto allocator = New<TensorAllocator>(getBackend());

           Tensor packedTensor;
           allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);

           // Pack B matrix into fp16
           fbgemmPacked16Pack(packedTensor, val->data(), false, nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol, packsize);

           io::Item item;
           item.name = pName;
           item.shape = val->shape();
           item.type = Type::packed16;

           // Use the actual memory as this will be aligned and padded.
           // When memory mapping this is required. Shape keeps track of
           // tensor size. Saving to *.npz will cut to size.
           auto mem = packedTensor->memory();
           item.bytes.resize(mem->size());
           copy(backend_, mem->data(), mem->data() + mem->size(), item.bytes.data());

           ioItems.emplace_back(std::move(item));
   #else
           ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
   #endif
         } else if (isIntgemm(gemmElementType) &&
                    (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) {
   #if COMPILE_CPU
           using cpu::integer::cols;
           using cpu::integer::rows;
           auto allocator = New<TensorAllocator>(getBackend());

           Tensor paramMat; // This allocates an extra 4 bytes at the end because of gemmElementType
           allocator->allocate(paramMat, val->shape(), gemmElementType);

           // Compute the quantization multiplier, compress the matrix and store quantMult at the end.
           // We need to transpose first, because our architecture-independent format requires a transposed matrix
           Tensor tmp;
           allocator->allocate(tmp, val->shape(), val->type());
           cpu::Transpose10(tmp, val);

           if(sizeOf(gemmElementType) == 1) { // is 8-bit Intgemm type
             float quantMult = cpu::integer::computeQuantMult(val);
             // Hardware-specific conversions which allow us to implement memory-mapping and avoid conversion at runtime
             cpu::integer::passOrAbort(gemmElementType); // Check if the hardware supports the GEMM type
             if(isSsse3(gemmElementType)) {
               intgemm::SSSE3::Kernels8::PrepareBTransposed(
                   tmp->data() /*input*/, paramMat->data() /*output*/, quantMult /*quant mult*/, rows(val), cols(val));
             } else if(isAvx2(gemmElementType)) {
               intgemm::AVX2::Kernels8::PrepareBTransposed(
                   tmp->data() /*input*/, paramMat->data() /*output*/, quantMult /*quant mult*/, rows(val), cols(val));
             } else if(isAvx512(gemmElementType)) {
               intgemm::AVX512BW::Kernels8::PrepareBTransposed(
                   tmp->data() /*input*/, paramMat->data() /*output*/, quantMult /*quant mult*/, rows(val), cols(val));
             } else {
               ABORT_IF(gemmElementType != Type::intgemm8, "Type {} is not supported", gemmElementType); // shouldn't really happen, but let's make sure
               intgemm::Int8::PrepareA(
                   tmp->data() /*input*/, paramMat->data() /*output*/, quantMult /*quant mult*/, rows(val), cols(val));
             }
             // Put the quantMult at the back of the tensor
             cpu::integer::getQuantMult(paramMat) = quantMult;
           } else if(sizeOf(gemmElementType) == 2) { // is 16-bit Intgemm type
             float quantMult = cpu::integer::computeQuantMult(val);
             // Hardware-specific conversions which allow us to implement memory-mapping and avoid conversion at runtime
             cpu::integer::passOrAbort(gemmElementType); // Check if the hardware supports the GEMM type
             if(isSse2(gemmElementType)) {
               intgemm::SSE2::Kernels16::PrepareBTransposed(
                   tmp->data() /*input*/, paramMat->data() /*output*/, quantMult /*quant mult*/, rows(val), cols(val));
             } else if(isAvx2(gemmElementType)) {
               intgemm::AVX2::Kernels16::PrepareBTransposed(
                   tmp->data() /*input*/, paramMat->data() /*output*/, quantMult /*quant mult*/, rows(val), cols(val));
             } else if(isAvx512(gemmElementType)) {
               intgemm::AVX512BW::Kernels16::PrepareBTransposed(
                   tmp->data() /*input*/, paramMat->data() /*output*/, quantMult /*quant mult*/, rows(val), cols(val));
             } else {
               ABORT_IF(gemmElementType != Type::intgemm16, "Type {} is not supported", gemmElementType); // shouldn't really happen, but let's make sure
               intgemm::Int16::PrepareA(
                   tmp->data() /*input*/, paramMat->data() /*output*/, quantMult /*quant mult*/, rows(val), cols(val));
             }
             // Put the quantMult at the back of the tensor
             cpu::integer::getQuantMult(paramMat) = quantMult;
           } else {
             ABORT("Incorrect Intgemm type size: {}", sizeOf(gemmElementType));
           }
           // Save... same as the fbgemm case
           io::Item item;
           item.name = pName;
           item.shape = val->shape();
           item.type = gemmElementType;

           auto mem = paramMat->memory();
           item.bytes.resize(mem->size());
           copy(backend_, mem->data(), mem->data() + mem->size(), item.bytes.data());
           ioItems.emplace_back(std::move(item));
   #else
           ABORT("Packed type {} only supported when compiled with -DCOMPILE_CPU=on", gemmElementType);
   #endif
         } else {
           ABORT_IF(saveElementType != Type::float32, "We currently do not know how to save matrices as {}", saveElementType);
           io::Item item;
           val->get(item, pName);
           item.convert(saveElementType);
           ioItems.emplace_back(std::move(item));
         }
       }

       // Now handle all non-float32 parameters
       for(auto& iter : paramsByElementType_) {
         auto type = iter.first;
         if(type == Type::float32)
           continue;

         for (auto p : iter.second->getMap()) {
           std::string pName = p.first;
           LOG(info, "Processing parameter {} with shape {} and type {}", pName, p.second->shape(), p.second->value_type());

           if (!namespace_.empty()) {
             if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
               pName = pName.substr(namespace_.size() + 2);
           }

           Tensor val = p.second->val();

           io::Item item;
           val->get(item, pName);
           ioItems.emplace_back(std::move(item));
         }
       }

       return ioItems;
     }

     void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) {
       auto ioItems = pack(gemmElementType, saveElementType);
       if (!meta.empty())
         io::addMetaToItems(meta, "special:model.yml", ioItems);
       io::saveItems(name, ioItems);
     }
   };

   } // namespace marian
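
In the listing above, ``pack()`` decides per parameter how to serialize it: FBGEMM-packed (``packed8avx2``/``packed8avx512``/``packed16``), intgemm-prepared (``intgemm8``/``intgemm16`` variants), or plain ``float32``. The sketch below is a minimal, hypothetical usage example of this class, converting a saved float32 model into an 8-bit intgemm model (roughly the flow used by Marian's offline model converter); the function name and file paths are placeholders and error handling is omitted.

.. code-block:: cpp

   // Hypothetical usage sketch: pack a float32 model into 8-bit intgemm format.
   // Paths and the helper name are illustrative only.
   #include "tensors/cpu/expression_graph_packable.h"

   using namespace marian;

   void convertToIntgemm8(const std::string& modelFrom, const std::string& modelTo) {
     auto graph = New<ExpressionGraphPackable>(); // inference-only graph (see constructor above)
     graph->setDevice(CPU0);                      // packing runs on the CPU backend
     graph->load(modelFrom);                      // register the float32 parameters
     graph->forward();                            // allocate and fill parameter tensors

     // Eligible "*_W" weights are prepared as transposed 8-bit intgemm matrices;
     // all remaining parameters are written out as float32.
     graph->packAndSave(modelTo, /*meta=*/"", Type::intgemm8, Type::float32);
   }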