Program Listing for File expression_graph_packable.h
#pragma once
#include "graph/expression_graph.h"
#include "fbgemm/packed_gemm.h"
#include "tensors/cpu/integer_common.h"
namespace marian {
namespace cpu {
void Transpose10(marian::Tensor out, const marian::Tensor in);
}
}
namespace marian {
// When FBGEMM-based packed GEMM is used, some weight matrices need to be packed offline.
// The decision which weights can be packed should be made by walking through the graph.
// That requires some more changes, so for now we do this just by the name ("_W") of the weights.
// This also makes the low-level packed_gemm.h API interact with the high-level graph class,
// so we put this immature code into a subclass of ExpressionGraph.
// We will improve this in the near future.
class ExpressionGraphPackable : public ExpressionGraph {
public:
ExpressionGraphPackable()
: ExpressionGraph( /* inference = */ true) {} // Packable expression graph only supports inference
virtual ~ExpressionGraphPackable() {}
// Convert the model weights into packed format and return them as IO items.
std::vector<io::Item> pack(Type gemmElementType = Type::float32, Type saveElementType = Type::float32) {
std::vector<io::Item> ioItems;
// handle packable parameters first (a float32 parameter is packable)
auto packableParameters = paramsByElementType_[Type::float32];
// sorted by name in std::map
for (auto p : packableParameters->getMap()) {
std::string pName = p.first;
LOG(info, "Processing parameter {} with shape {} and type {}", pName, p.second->shape(), p.second->value_type());
if (!namespace_.empty()) {
if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
pName = pName.substr(namespace_.size() + 2);
}
Tensor val = p.second->val();
// save as packed format
// @TODO Packable weights are currently found by hardcoded name patterns
// int8 - all weights used by affine and dot ops
// fp16 - all weights used by affine ops
if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512)
&& (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
// packing information - size
int nrow;
int ncol;
uint64_t packsize;
fbgemmPacked8PackInfo(val->shape(),
gemmElementType,
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
auto allocator = New<TensorAllocator>(getBackend());
// buffer tensor to save packed matrix
Tensor packedTensor;
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
// Pack the B matrix into int8
fbgemmPacked8Pack(packedTensor,
val->data(),
gemmElementType,
pName.find("Wemb") != std::string::npos,
nrow,
ncol,
packsize);
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = gemmElementType;
// Use the actual memory buffer, as it is aligned and padded;
// this is required when memory mapping. The shape keeps track of the
// tensor size; saving to *.npz will cut it down to size.
auto mem = packedTensor->memory();
item.bytes.resize(mem->size());
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
#else
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
#endif
// fp16 quantization option
} else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) {
#if USE_FBGEMM
using namespace marian::cpu::variant;
// packing information
int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol;
uint64_t packsize;
fbgemmPacked16PackInfo(val->shape(),
false,
nrow,
ncol,
kernel_ncol_blocks,
brow,
bcol,
last_brow,
nbrow,
nbcol,
packsize);
auto allocator = New<TensorAllocator>(getBackend());
Tensor packedTensor;
allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8);
// Pack the B matrix into fp16
fbgemmPacked16Pack(packedTensor,
val->data(),
false,
nrow,
ncol,
kernel_ncol_blocks,
brow,
bcol,
last_brow,
nbrow,
nbcol,
packsize);
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = Type::packed16;
// Use the actual memory buffer, as it is aligned and padded;
// this is required when memory mapping. The shape keeps track of the
// tensor size; saving to *.npz will cut it down to size.
auto mem = packedTensor->memory();
item.bytes.resize(mem->size());
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
#else
ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType);
#endif
} else if (isIntgemm(gemmElementType) &&
(pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) {
#if COMPILE_CPU
using cpu::integer::cols;
using cpu::integer::rows;
auto allocator = New<TensorAllocator>(getBackend());
Tensor paramMat; // allocating with gemmElementType reserves 4 extra bytes at the end for the quantization multiplier
allocator->allocate(paramMat, val->shape(), gemmElementType);
// Compute the quantization multiplier, quantize the matrix and store the multiplier at the end.
// We need to transpose first, because our architecture-independent format requires a transposed matrix.
Tensor tmp;
allocator->allocate(tmp, val->shape(), val->type());
cpu::Transpose10(tmp, val);
if(sizeOf(gemmElementType) == 1) { // is 8-bit Intgemm type
float quantMult = cpu::integer::computeQuantMult<Type::intgemm8>(val);
// Hardware-specific conversion, which enables memory mapping and avoids conversion at runtime
cpu::integer::passOrAbort(gemmElementType); // Check if the hardware supports the GEMM type
if(isSsse3(gemmElementType)) {
intgemm::SSSE3::Kernels8::PrepareBTransposed(tmp->data(), /*input*/
paramMat->data<int8_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
} else if(isAvx2(gemmElementType)) {
intgemm::AVX2::Kernels8::PrepareBTransposed(tmp->data(), /*input*/
paramMat->data<int8_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
} else if(isAvx512(gemmElementType)) {
intgemm::AVX512BW::Kernels8::PrepareBTransposed(tmp->data(), /*input*/
paramMat->data<int8_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
} else {
ABORT_IF(gemmElementType != Type::intgemm8, "Type {} is not supported", gemmElementType); // shouldn't really happen, but let's make sure
intgemm::Int8::PrepareA(tmp->data(), /*input*/
paramMat->data<int8_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
}
// Put the quantMult at the back of the tensor
cpu::integer::getQuantMult<Type::intgemm8>(paramMat) = quantMult;
} else if(sizeOf(gemmElementType) == 2) { // is 16-bit Intgemm type
float quantMult = cpu::integer::computeQuantMult<Type::intgemm16>(val);
// Hardware-specific conversion, which enables memory mapping and avoids conversion at runtime
cpu::integer::passOrAbort(gemmElementType); // Check if the hardware supports the GEMM type
if(isSse2(gemmElementType)) {
intgemm::SSE2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/
paramMat->data<int16_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
} else if(isAvx2(gemmElementType)) {
intgemm::AVX2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/
paramMat->data<int16_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
} else if(isAvx512(gemmElementType)) {
intgemm::AVX512BW::Kernels16::PrepareBTransposed(tmp->data(), /*input*/
paramMat->data<int16_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
} else {
ABORT_IF(gemmElementType != Type::intgemm16, "Type {} is not supported", gemmElementType); // shouldn't really happen, but let's make sure
intgemm::Int16::PrepareA(tmp->data(), /*input*/
paramMat->data<int16_t>(), /*output*/
quantMult, /*Quant Mult*/
rows(val),
cols(val));
}
// Put the quantMult at the back of the tensor
cpu::integer::getQuantMult<Type::intgemm16>(paramMat) = quantMult;
} else {
ABORT("Incorrect Intgemm type size: {}", sizeOf(gemmElementType));
}
// Save... same as the FBGEMM case above
io::Item item;
item.name = pName;
item.shape = val->shape();
item.type = gemmElementType;
auto mem = paramMat->memory();
item.bytes.resize(mem->size());
copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
ioItems.emplace_back(std::move(item));
#else
ABORT("Packed type {} only supported when compiled with -DCOMPILE_CPU=on", gemmElementType);
#endif
} else {
ABORT_IF(saveElementType != Type::float32, "We currently do not know how to save matrices as {}", saveElementType);
io::Item item;
val->get(item, pName);
item.convert(saveElementType);
ioItems.emplace_back(std::move(item));
}
}
// Now handle all non-float32 parameters
for(auto& iter : paramsByElementType_) {
auto type = iter.first;
if(type == Type::float32)
continue;
for (auto p : iter.second->getMap()) {
std::string pName = p.first;
LOG(info, "Processing parameter {} with shape {} and type {}", pName, p.second->shape(), p.second->value_type());
if (!namespace_.empty()) {
if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
pName = pName.substr(namespace_.size() + 2);
}
Tensor val = p.second->val();
io::Item item;
val->get(item, pName);
ioItems.emplace_back(std::move(item));
}
}
return ioItems;
}
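// Pack the model weights as above and save them to disk, optionally
// attaching the given YAML config under the key "special:model.yml".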
void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) {
auto ioItems = pack(gemmElementType, saveElementType);
if (!meta.empty())
io::addMetaToItems(meta, "special:model.yml", ioItems);
io::saveItems(name, ioItems);
}
};
} // namespace marian
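For context, offline packing with this class typically looks like the sketch below (roughly what a conversion tool such as marian-conv does). This is a minimal sketch, not part of the header: the device setup and the load/forward calls are assumptions based on the inherited ExpressionGraph API, and Type::intgemm8 stands in for whichever packed GEMM type is wanted.

// Minimal usage sketch (hypothetical); API details assumed from ExpressionGraph.
#include "tensors/cpu/expression_graph_packable.h"

using namespace marian;

void convertToPacked(const std::string& modelFrom, const std::string& modelTo) {
  auto graph = New<ExpressionGraphPackable>();    // inference-only graph (see constructor above)
  graph->setDevice(DeviceId{0, DeviceType::cpu}); // packing runs on the CPU backend
  graph->load(modelFrom);                         // read the original float32 weights
  graph->forward();                               // materialize the parameter tensors
  // Re-encode all eligible "_W" weights as 8-bit intgemm and write the result to disk.
  graph->packAndSave(modelTo, /*meta=*/"", Type::intgemm8, Type::float32);
}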