.. _program_listing_file_src_optimizers_optimizers.cpp:

Program Listing for File optimizers.cpp
=======================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_optimizers_optimizers.cpp>` (``src/optimizers/optimizers.cpp``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   #include "optimizers.h"

   #include "common/io.h"
   #include "tensors/tensor_operators.h"
   #include <array>

   namespace marian {

   float OptimizerBase::update(Tensor params, Tensor grads, size_t mbSize, float costScaleFactor) {
     int elements = (int)params->size();
     LOG_ONCE(info, "Parameter type {}, optimization type {}, casting types {}",
              params->type(), optimizerType_, castOptimizerType_);

     int numAllocateShards = 0;
     if(mvAvg_)
       numAllocateShards += 1; // one shard for exp smoothing
     if(castOptimizerType_)
       numAllocateShards += 2; // two shards for conversion

     // allocate storage for shards
     if(numAllocateShards > 0 && !baseAlloc_) {
       LOG_ONCE(info, "Allocating memory for general optimizer shards");
       baseAlloc_ = New<TensorAllocator>(params->getBackend());
       baseAlloc_->reserveExact(std::vector<size_t>(numAllocateShards, elements * sizeOf(optimizerType_)));
     }

     if(mvAvg_ && !avg_) {
       // allocate exp smooth shard tensor
       baseAlloc_->allocate(avg_, {1, elements}, optimizerType_);
       // initialize from parameters, this will be overwritten by checkpoint data if a checkpoint is found or by the first update.
       // If we resume training with no checkpoint this initialization will survive and be the basis for further averaging, which is
       // what we want in that slightly pathological circumstance.
       CopyCast(avg_, params);
     }

     if(castOptimizerType_) {
       if(!pm_) {
         // create parameter master copy and temporary gradient shard
         baseAlloc_->allocate(pm_, {1, elements}, optimizerType_);
         baseAlloc_->allocate(gd_, {1, elements}, optimizerType_);
         // keep parameter master copy around and initialize once, converting types
         CopyCast(pm_, params);
       }
     } else {
       // no conversion, just assign at each update
       pm_ = params;
     }

     if(!alloc_) {
       size_t size = pm_->memory()->size();
       alloc_ = New<Allocator>(pm_->getBackend()->getDeviceId(), size, size);
     }

     if(castOptimizerType_)
       CopyCast(gd_, grads);
     else
       gd_ = grads;

     // reverse cost scaling when used
     if(costScaleFactor != 1.f)
       Element(functional::_1 = functional::_1 / costScaleFactor, gd_);

     // clip gradients when used
     if(!clipper_) {
   #if 1 // @BUGBUG: when we changed to ce-sum we did not adapt gradient clipping. The norm now depends on
         // mini-batch size, that is wrong. Keeping this for backcompat with regression tests.
         // To be removed as soon as possible.
       float clipNorm = options_->get<float>("clip-norm", 0.f); // this is different than the dynamic scaling as it is an absolute upper limit
       if(clipNorm > 0.f) {
         clipper_ = New<NormClipper>(clipNorm);
       } else
   #endif
       {
         clipper_ = New<ReportNormClipper>(0.f); // don't clip, just report
       }

       // This is a bit magical.
       // Preallocate in order to avoid later reallocation: number of maximum GPU blocks times size of float plus some overhead.
       // This is not too critical and more an educated guess. If less memory is required we haven't lost much, if more is required
       // (unlikely) it will reallocate. The hope is to avoid GPU memory fragmentation.
       // @TODO: check if this actually does anything beneficial, e.g. throw at reallocation and check if that ever happens.
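       // arithmetic behind the guess above: taking 65535 as the maximum number of
       // GPU blocks (one float per block for the norm reduction) plus some slack,
       // this preallocates 65535 * 4 + 1024 = 263,164 bytes, i.e. about 257 KiB per device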
       size_t prealloc = 65535 * 4 + 1024;
       auto clipAlloc = New<Allocator>(pm_->getBackend()->getDeviceId(), /*bytes=*/prealloc, /*step=*/1024);
       clipper_->setAllocator(clipAlloc);
     }
     float gNorm = clipper_->clip(gd_); // clip or rescale, report norm from before clipping

     // perform update on master copy with cast gradients
     // if a type cast has been performed. Otherwise the
     // original tensors are used.
     updateImpl(pm_, gd_, mbSize);

     // if exponential smoothing is used update the average
     if(mvAvg_)
       updateAvgParams(avg_, pm_, batchesSeen_, mbSize);

     // undo parameter type cast if required
     if(castOptimizerType_)
       CopyCast(params, pm_);

     params->getBackend()->synchronize();

     return gNorm;
   }

   void OptimizerBase::swapWithSmoothed(Tensor params) {
     if(!mvAvg_) // no smoothing, don't do anything
       return;

     // This assumes that two swaps are going to happen eventually.
     if(castOptimizerType_) {
       // If true then optimizer type is different from the graph type,
       // hence a parameter master copy exists and we swap with the master copy.
       // We then cast from the optimizer parameter type to the graph parameter type.
       pm_->swap(avg_);
       CopyCast(params, pm_);
     } else {
       // Types are equal hence there is no parameter master copy. This means
       // we need to do a proper swap between the graph params and the smoothed
       // version. We will then swap again with the next call restoring original
       // parameters.
       params->swap(avg_);
     }
   }

   void OptimizerBase::load(std::vector<io::Item>& items,
                            const std::vector<Ptr<OptimizerBase>>& opts,
                            const std::vector<Ptr<Backend>>& backends,
                            const ScatterStateFunc& scatterFn,
                            bool isMainProcess) {
     isMainProcess;
     ABORT_IF(opts.size() != backends.size(), "opts and backends of different sizes??");

     size_t numShards = 0;
     if(mvAvg_)
       numShards += 1;
     if(castOptimizerType_)
       numShards += 2;

     if(castOptimizerType_) {
       io::Item iParams;
       for(auto item : items)
         if(item.name == "master_parameters")
           iParams = std::move(item);

       if(iParams.bytes.empty()) {
         LOG(warn, "[warn] Parameters not found in .npz file");
       } else {
         ABORT_IF(optimizerType_ != iParams.type,
                  "Current ({}) and previous ({}) optimization type do not match",
                  optimizerType_, iParams.type);

         scatterFn(iParams, [&](size_t localDeviceIndex, const char* begin, const char* end) {
           auto opt = opts[localDeviceIndex];
           if(!opt->pm_) { // lazily allocate
             size_t size = end - begin; // this is size in bytes now
             if(!opt->baseAlloc_) {
               LOG_ONCE(info, "Allocating memory for general optimizer shards");
               opt->baseAlloc_ = New<TensorAllocator>(backends[localDeviceIndex]);
               opt->baseAlloc_->reserveExact(std::vector<size_t>(numShards, size));
             }
             int elements = (int)size / (int)sizeOf(iParams.type);
             opt->baseAlloc_->allocate(opt->pm_, {1, elements}, iParams.type);
             opt->baseAlloc_->allocate(opt->gd_, {1, elements}, iParams.type);
           }
           opt->pm_->set(begin, end, iParams.type); // set the value
         });
       }
     }

     if(mvAvg_) {
       io::Item iAvg;
       for(auto item : items)
         if(item.name == "exp_smoothing")
           iAvg = std::move(item);

       if(iAvg.bytes.empty()) {
         LOG(warn, "[warn] Average not found in .npz file");
       } else {
         ABORT_IF(optimizerType_ != iAvg.type,
                  "Current ({}) and previous ({}) optimization type do not match",
                  optimizerType_, iAvg.type);

         scatterFn(iAvg, [&](size_t localDeviceIndex, const char* begin, const char* end) {
           auto opt = opts[localDeviceIndex];
           if(!opt->avg_) { // lazily allocate
             size_t size = end - begin; // this is size in bytes now
             if(!opt->baseAlloc_) {
               LOG_ONCE(info, "Allocating memory for general optimizer shards");
               opt->baseAlloc_ = New<TensorAllocator>(backends[localDeviceIndex]);
               opt->baseAlloc_->reserveExact(std::vector<size_t>(numShards, size));
             }
             int elements = (int)size / (int)sizeOf(iAvg.type);
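             // (the scatter callback hands over a raw byte range, so the division
             // by sizeOf(iAvg.type) above turns bytes into an element count before
             // the shard is allocated with shape {1, elements} in the checkpoint's type)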
             opt->baseAlloc_->allocate(opt->avg_, {1, elements}, iAvg.type);
           }
           opt->avg_->set(begin, end, iAvg.type); // set the value
         });
       }
     }
   }

   void OptimizerBase::save(std::vector<io::Item>& items,
                            const std::vector<Ptr<OptimizerBase>>& opts,
                            const GatherStateFunc& gatherFn,
                            bool isMainProcess) {
     isMainProcess;

     if(castOptimizerType_) {
       // fetch and concatenate state vectors for high precision copy
       io::Item pm = gatherFn([&](size_t localDeviceIndex) {
         auto opt = opts[localDeviceIndex];
         io::Item item;
         opt->pm_->get(item, "master_parameters");
         return item;
       });
       items.emplace_back(std::move(pm));
     }

     if(mvAvg_) {
       // fetch and concatenate state vectors for smoothed parameters
       io::Item avg = gatherFn([&](size_t localDeviceIndex) {
         auto opt = opts[localDeviceIndex];
         io::Item item;
         opt->avg_->get(item, "exp_smoothing");
         return item;
       });
       items.emplace_back(std::move(avg));
     }
   }

   void Sgd::updateImpl(Tensor params, Tensor grads, size_t actualMBSize) {
     actualMBSize; // (no correction for base update needed beyond using ce-sum)

     using namespace functional;
     Element(_1 -= eta_ * _2, params, grads);
   }

   void Sgd::load(std::vector<io::Item>& items,
                  const std::vector<Ptr<OptimizerBase>>& opts,
                  const std::vector<Ptr<Backend>>& backends,
                  const ScatterStateFunc& scatterFn,
                  bool isMainProcess) {
     OptimizerBase::load(items, opts, backends, scatterFn, isMainProcess);
   }

   void Sgd::save(std::vector<io::Item>& items,
                  const std::vector<Ptr<OptimizerBase>>& opts,
                  const GatherStateFunc& gatherFn,
                  bool isMainProcess) {
     OptimizerBase::save(items, opts, gatherFn, isMainProcess); // collect parameters from base
   }

   // Adagrad

   void Adagrad::updateImpl(Tensor params, Tensor grads, size_t actualMBSize) {
     actualMBSize; // not used in Adagrad

     // allocate optimizer-specific parameters
     if(!alloc_) {
       LOG_ONCE(info, "Allocating memory for Adagrad-specific shards");
       alloc_ = New<TensorAllocator>(params->getBackend());
     }

     if(!gt_) {
       int elements = (int)params->size();
       alloc_->reserveExact(params->memory()->size());
       alloc_->allocate(gt_, {1, elements}, params->type());
       gt_->set(0.f);
     }

     using namespace functional;

     Element(_1 += (_2 * _2), gt_, grads);

     // make sure eps_ does not drop below smallest (positive) value, add some reserve by multiplying with 2
     eps_ = (float)std::max(NumericLimits<double>(params->type()).min * 2.f, (double)eps_);
     Element(_1 -= (eta_ / (sqrt(_2) + eps_)) * _3, params, gt_, grads);
   }

   void Adagrad::load(std::vector<io::Item>& items,
                      const std::vector<Ptr<OptimizerBase>>& opts,
                      const std::vector<Ptr<Backend>>& backends,
                      const ScatterStateFunc& scatterFn,
                      bool isMainProcess) {
     OptimizerBase::load(items, opts, backends, scatterFn, isMainProcess);

     if(isMainProcess)
       LOG(info, "Loading Adagrad parameters");

     io::Item iGt;
     for(auto item : items) // extract data into vectors
       if(item.name == "adagrad_gt")
         iGt = std::move(item);

     if(iGt.bytes.empty()) {
       LOG(warn, "[warn] Adagrad parameters not found in checkpoint");
       return;
     }

     ABORT_IF(optimizerType_ != iGt.type,
              "Current ({}) and previous ({}) optimization type do not match",
              optimizerType_, iGt.type);

     scatterFn(iGt, [&](size_t localDeviceIndex, const char* begin, const char* end) {
       auto opt = std::dynamic_pointer_cast<Adagrad>(opts[localDeviceIndex]);
       if(!opt->gt_) {
         if(!opt->alloc_)
           opt->alloc_ = New<TensorAllocator>(backends[localDeviceIndex]);
         size_t size = end - begin; // this is size in bytes now
         int elements = (int)size / (int)sizeOf(iGt.type);
         opt->alloc_->reserveExact(size);
         opt->alloc_->allocate(opt->gt_, {1, elements}, iGt.type);
       }
       opt->gt_->set(begin, end, iGt.type);
     });
   }

   void Adagrad::save(std::vector<io::Item>& items,
                      const std::vector<Ptr<OptimizerBase>>& opts,
                      const GatherStateFunc& gatherFn,
                      bool isMainProcess) {
     OptimizerBase::save(items, opts, gatherFn, isMainProcess); // collect parameters from base
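     // (the base call above emits the shared "master_parameters" and/or
     // "exp_smoothing" items when applicable; the Adagrad-specific "adagrad_gt"
     // state is appended to the same items vector below)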

     if(isMainProcess)
       LOG(info, "Saving Adagrad parameters");

     // fetch and concatenate state vectors from distributed shards into a CPU-side vector
     io::Item gt = gatherFn([&](size_t localDeviceIndex) {
       auto opt = std::dynamic_pointer_cast<Adagrad>(opts[localDeviceIndex]);
       io::Item item;
       opt->gt_->get(item, "adagrad_gt");
       return item;
     });
     items.emplace_back(std::move(gt));
   }

   void Adagrad::resetStats() {
     if(gt_)
       gt_->set(0.f);
   }

   // Adam

   void Adam::updateImpl(Tensor params, Tensor grads, size_t actualMBSize) {
     // lazy allocation
     if(!alloc_) {
       LOG_ONCE(info, "Allocating memory for Adam-specific shards");
       alloc_ = New<TensorAllocator>(params->getBackend());
     }

     if(!mt_) {
       int elements = (int)params->size();
       size_t shard = (size_t)elements * sizeOf(params->type());
       alloc_->reserveExact({shard, shard});

       alloc_->allocate(mt_, {1, elements}, params->type());
       mt_->set(0.f);

       alloc_->allocate(vt_, {1, elements}, params->type());
       vt_->set(0.f);
     }

     double T = 1, Tref = 1;
     if(OptimizerBase::refMBWordsParam_ > 0) {
       T = (double)actualMBSize;
       if(actualMBSize > refBatchTrgWords_)
         Tref = (double)refMBWordsParam_;
       else
         Tref = T;
     }

     // adjust for minibatch-size changes if Adam parameters are given a reference size (else do nothing)
     // Why the T/Tref factor on eta? The Adam optimizer adds an RMS-normalized gradient
     // value (times learning rate) to the model. We know that for Tref, that learning rate is good.
     // If we increase the batch size by (T/Tref), then without adjustment, we would still add an
     // RMS-normalized gradient value. That means that the contribution of an individual label is
     // now weighted down by (T/Tref). However, batch-size agnostic hyper-parameterization aims to keep
     // the weight on the contribution of each label gradient invariant. Thus, we must undo that
     // down-weighting, by multiplying the RMS-normalized gradient value by an additional factor
     // of (T/Tref). This is implemented here by locally multiplying the learning rate
     // with that factor.
     double eta   = eta_ * (T / Tref);
     double beta1 = beta1_;
     double beta2 = beta2_;
     double decay = w_;

     // denominators. At steady state: =1. This recursion does the same as the Adam beta correction term.
     denom1_ = (beta1 * denom1_) + (1 - beta1); // momentum smoothing
     denom2_ = (beta2 * denom2_) + (1 - beta2); // RMS normalization

     // numerators. Divide by T to convert ce-sum gradient to avg gradient.
     using namespace functional;
   #if 0 // why the division by T or T^2 here? It's T=1 without mb-ref anyway and we have the adjustment above, also converges a lot(!) slower with T != 1
     Element(_1 = ((float)beta1 * _1) + float((1 - beta1) / T    ) *  _2,       mt_, grads); // momentum smoothing. At steady state: =smoothed avg gradient
     Element(_1 = ((float)beta2 * _1) + float((1 - beta2) / T / T) * (_2 * _2), vt_, grads); // RMS normalization. At steady state: =mean square of the avg gradients
   #else
     Element(_1 = ((float)beta1 * _1) + float((1 - beta1)) *  _2,       mt_, grads); // momentum smoothing. At steady state: =smoothed avg gradient
     Element(_1 = ((float)beta2 * _1) + float((1 - beta2)) * (_2 * _2), vt_, grads); // RMS normalization. At steady state: =mean square of the avg gradients
   #endif

     // make sure eps_ does not drop below minimum value, this is important
     // when training with mixed precision. Otherwise we divide by 0.
     // We multiply the minimum by 2 in order to step away from the abyss.
     eps_ = std::max(NumericLimits<float>(params->type()).min * 2.f, eps_);
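
     // The update below implements, in Adam notation,
     //   x_t = x_{t-1} - eta * ( m_hat / (sqrt(v_hat) + eps) + w * x_{t-1} )
     // with m_hat = mt_ / denom1_ and v_hat = vt_ / denom2_. Since denom1_ starts
     // at 0, the recursion denom1_ = beta1 * denom1_ + (1 - beta1) above yields
     // denom1_ = 1 - beta1^t after t updates, i.e. exactly Adam's bias-correction
     // term (and likewise denom2_ = 1 - beta2^t).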
     // apply Adam normalization
     float etaf    = (float)eta,
           denom1f = (float)denom1_,
           denom2f = (float)denom2_,
           decayf  = (float)decay; // (get casts out of Element expression for readability)
     Element(_1 -= etaf                               // learning-rate: x_t = x_{t-1} - \eta * (...)
                   * ((  (     _2 / denom1f)          // momentum-smoothed per-sample gradient: m_{t-1}
                       / (sqrt(_3 / denom2f) + eps_)) // normalize by RMS: \sqrt(v_{t-1})
                      + (decayf * _1)),               // weight-decay: w * x_{t-1}
             params, // =_1
             mt_,    // =_2
             vt_     // =_3
     );
   }

   void Adam::load(std::vector<io::Item>& items,
                   const std::vector<Ptr<OptimizerBase>>& opts,
                   const std::vector<Ptr<Backend>>& backends,
                   const ScatterStateFunc& scatterFn,
                   bool isMainProcess) {
     OptimizerBase::load(items, opts, backends, scatterFn, isMainProcess);

     if(isMainProcess)
       LOG(info, "Loading Adam parameters");

     io::Item iMt;
     io::Item iVt;
     std::array<double, 2> vDenoms{};

     for(auto item : items) { // extract data into vectors
       if(item.name == "adam_mt") {
         iMt = std::move(item);
       } else if(item.name == "adam_vt") {
         iVt = std::move(item);
       } else if(item.name == "adam_denoms") {
         ABORT_IF(item.size() != 2 * sizeof(double), "adam_denoms should have 2 entries not {} bytes", item.size());
         std::copy((double*)item.data(), ((double*)item.data()) + 2, vDenoms.begin());
         // Back compat note: Old files lacked "adam_denoms". For those, vDenoms will remain 0, which reproduces the old behavior.
       }
     }

     if(iMt.bytes.empty() || iVt.bytes.empty()) {
       LOG(warn, "[warn] Adam parameters not found in .npz file");
       return;
     }

     ABORT_IF(optimizerType_ != iMt.type,
              "Current ({}) and previous ({}) optimization type do not match",
              optimizerType_, iMt.type);
     ABORT_IF(iMt.size() != iVt.size(), "mt and vt have different sizes??");

     scatterFn(iMt, [&](size_t localDeviceIndex, const char* begin, const char* end) {
       auto opt = std::dynamic_pointer_cast<Adam>(opts[localDeviceIndex]);

       // denominators need to be set in all shards, hijack this scatter
       opt->denom1_ = vDenoms[0];
       opt->denom2_ = vDenoms[1];

       if(!opt->mt_ || !opt->vt_) { // lazily allocate
         if(!opt->alloc_)
           opt->alloc_ = New<TensorAllocator>(backends[localDeviceIndex]);
         size_t size = end - begin; // this is size in bytes now
         int elements = (int)size / (int)sizeOf(iMt.type);
         opt->alloc_->reserveExact(2 * size);
         opt->alloc_->allocate(opt->mt_, {1, elements}, iMt.type);
         opt->alloc_->allocate(opt->vt_, {1, elements}, iMt.type);
       }

       opt->mt_->set(begin, end, iMt.type); // set the value
     });

     scatterFn(iVt, [&](size_t localDeviceIndex, const char* begin, const char* end) {
       auto opt = std::dynamic_pointer_cast<Adam>(opts[localDeviceIndex]);
       opt->vt_->set(begin, end, iVt.type);
     });
   }

   void Adam::save(std::vector<io::Item>& items,
                   const std::vector<Ptr<OptimizerBase>>& opts,
                   const GatherStateFunc& gatherFn,
                   bool isMainProcess) {
     OptimizerBase::save(items, opts, gatherFn, isMainProcess); // collect parameters from base

     if(isMainProcess)
       LOG(info, "Saving Adam parameters");

     // fetch and concatenate state vectors from distributed shards into a CPU-side vector
     io::Item mt = gatherFn([&](size_t localDeviceIndex) {
       auto opt = std::dynamic_pointer_cast<Adam>(opts[localDeviceIndex]);
       io::Item item;
       opt->mt_->get(item, "adam_mt");
       return item;
     });
     items.emplace_back(std::move(mt));

     io::Item vt = gatherFn([&](size_t localDeviceIndex) {
       auto opt = std::dynamic_pointer_cast<Adam>(opts[localDeviceIndex]);
       io::Item item;
       opt->vt_->get(item, "adam_vt");
       return item;
     });
     items.emplace_back(std::move(vt));

     std::vector<double> vDenoms{denom1_, denom2_};
     items.emplace_back(io::fromVector(vDenoms, "adam_denoms"));
   }
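   // (note on the checkpoint format: the item names written by Adam::save() above,
   // "adam_mt", "adam_vt" and "adam_denoms", are exactly the names Adam::load()
   // searches for when resuming, so the two functions must be kept in sync)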

   void Adam::resetStats() {
     if(mt_)
       mt_->set(0.f);

     if(vt_)
       vt_->set(0.f);

     denom1_ = 0; // @BUGBUG: or 1 or refMBWords if so specified. Fix once we have proper parameterization for that.
     denom2_ = 0;
   }

   Ptr<OptimizerBase> Optimizer(Ptr<Options> options) {
     auto optType = options->get<std::string>("optimizer");
     auto params = options->has("optimizer-params")
                       ? options->get<std::vector<float>>("optimizer-params")
                       : std::vector<float>({});

     Ptr<OptimizerBase> opt;
     if(optType == "sgd") {
       opt = New<Sgd>(options);
     } else if(optType == "adagrad") {
       opt = New<Adagrad>(options);
     } else if(optType == "adam") {
       opt = New<Adam>(options);
     } else {
       ABORT("Unknown optimizer type: {}", optType);
     }
     opt->setParams(params);

     return opt;
   }

   }  // namespace marian
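
The ``Optimizer`` factory above is the usual entry point. A minimal usage sketch
(not part of ``optimizers.cpp``; the ``options`` object and ``batchTrgWords``
variable are assumed to exist, with only the ``optimizer`` and
``optimizer-params`` keys taken from the listing):

.. code-block:: cpp

   // create the optimizer selected by the "optimizer" option (sgd/adagrad/adam)
   Ptr<OptimizerBase> opt = Optimizer(options);

   // one training step: applies the ce-sum gradients to the parameters and
   // returns the gradient norm as reported (or clipped) by the clipper
   float gNorm = opt->update(params,                  // parameter tensor
                             grads,                   // gradient tensor
                             /*mbSize=*/batchTrgWords,
                             /*costScaleFactor=*/1.f);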