Program Listing for File optimizers.cpp

Return to documentation for file (src/optimizers/optimizers.cpp)

#include "optimizers.h"

#include "common/io.h"
#include "tensors/tensor_operators.h"
#include <array>

namespace marian {

float OptimizerBase::update(Tensor params, Tensor grads, size_t mbSize, float costScaleFactor) {
  int elements = (int)params->size();

  LOG_ONCE(info, "Parameter type {}, optimization type {}, casting types {}",
           params->type(), optimizerType_, castOptimizerType_);

  int numAllocateShards = 0;
  if(mvAvg_) numAllocateShards += 1; // one shard for exp smoothing
  if(castOptimizerType_) numAllocateShards += 2; // two shards for conversion

  // allocate storage for shards
  if(numAllocateShards > 0 && !baseAlloc_) {
    LOG_ONCE(info, "Allocating memory for general optimizer shards");
    baseAlloc_ = New<TensorAllocator>(params->getBackend());
    baseAlloc_->reserveExact(std::vector<size_t>(numAllocateShards, elements * sizeOf(optimizerType_)));
  }

  if(mvAvg_ && !avg_) {
    // allocate exp smooth shard tensor
    baseAlloc_->allocate(avg_, {1, elements}, optimizerType_);
    // initialize from parameters; this will be overwritten by checkpoint data if a checkpoint is found, or by the first update.
    // If we resume training with no checkpoint, this initialization will survive and be the basis for further averaging, which is
    // what we want in that slightly pathological circumstance.
    CopyCast(avg_, params);
  }

  if(castOptimizerType_) {
    if(!pm_) {
      // create parameter master copy and temporary gradient shard
      baseAlloc_->allocate(pm_, {1, elements}, optimizerType_);
      baseAlloc_->allocate(gd_, {1, elements}, optimizerType_);

      // keep parameter master copy around and initialize once, converting types
      CopyCast(pm_, params);
    }
  } else {
    // no conversion, just assign at each update
    pm_ = params;
  }

  if(!alloc_) {
    size_t size = pm_->memory()->size();
    alloc_ = New<Allocator>(pm_->getBackend()->getDeviceId(), size, size);
  }

  if(castOptimizerType_)
    CopyCast(gd_, grads);
  else
    gd_ = grads;

  // reverse cost scaling when used
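  // (with mixed-precision training the loss is typically multiplied by a scale factor,
  //  e.g. 1024, to avoid gradient underflow; dividing by costScaleFactor undoes that)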
  if(costScaleFactor != 1.f)
    Element(functional::_1 = functional::_1 / costScaleFactor, gd_);

  // clip gradients when used
  if(!clipper_) {
  #if 1 // @BUGBUG: when we changed to ce-sum we did not adapt gradient clipping. The norm now depends on mini-batch size, that is wrong. Keeping this for backcompat with regression tests. To be removed as soon as possible.
    float clipNorm = options_->get<float>("clip-norm", 0.f); // this is different than the dynamic scaling as it is an absolute upper limit
    if(clipNorm > 0.f) {
      clipper_ = New<NormClipper>(clipNorm);
    } else
  #endif
    {
      clipper_ = New<ReportNormClipper>(0.f); // don't clip, just report
    }

    // This is a bit magical.
    // Preallocate in order to avoid later reallocation: number of maximum GPU blocks times size of float plus some overhead.
    // This is not too critical and more an educated guess. If less memory is required we haven't lost much, if more is required
    // (unlikely) it will reallocate. The hope is to avoid GPU memory fragmentation.
    // @TODO: check if this actually does anything beneficial, e.g. throw at reallocation and check if that ever happens.
    size_t prealloc = 65535 * 4 + 1024;
    auto clipAlloc = New<Allocator>(pm_->getBackend()->getDeviceId(), /*bytes=*/prealloc, /*step=*/1024);
    clipper_->setAllocator(clipAlloc);
  }
  float gNorm = clipper_->clip(gd_); // clip or rescale, report norm from before clipping

  // perform update on master copy with cast gradients
  // if a type cast has been performed. Otherwise the
  // original tensors are used.
  updateImpl(pm_, gd_, mbSize);

  // if exponential smoothing is used update the average
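  // (avg_ holds an exponentially-weighted running average of the (master) parameters;
  //  swapWithSmoothed() below can swap it in, e.g. for validation or saving)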
  if(mvAvg_)
    updateAvgParams(avg_, pm_, batchesSeen_, mbSize);

  // undo parameter type cast if required
  if(castOptimizerType_)
    CopyCast(params, pm_);

  params->getBackend()->synchronize();

  return gNorm;
}

void OptimizerBase::swapWithSmoothed(Tensor params) {
  if(!mvAvg_) // no smoothing, don't do anything
    return;

  // This assumes that a second swap will eventually follow and restore the original state.
  if(castOptimizerType_) {
    // The optimizer type differs from the graph type, hence a parameter master copy
    // exists and we swap the smoothed parameters with that master copy.
    // We then copy-cast from the optimizer parameter type to the graph parameter type.
    pm_->swap(avg_);
    CopyCast(params, pm_);
  } else {
    // Types are equal hence there is no parameter master copy. This means
    // we need to do a proper swap between the graph params and the smoothed
    // version. We will then swap again with the next call restoring original
    // parameters.
    params->swap(avg_);
  }
}

void OptimizerBase::load(std::vector<io::Item>& items,
                         const std::vector<Ptr<OptimizerBase>>& opts,
                         const std::vector<Ptr<Backend>>& backends,
                         const ScatterStateFunc& scatterFn,
                         bool isMainProcess) {
  isMainProcess; // (unused; referenced only to silence a compiler warning)
  ABORT_IF(opts.size() != backends.size(), "opts and backends of different sizes??");

  size_t numShards = 0;
  if(mvAvg_) numShards += 1;
  if(castOptimizerType_) numShards += 2;

  if(castOptimizerType_) {
    io::Item iParams;
    for(auto item : items)
      if(item.name == "master_parameters")
        iParams = std::move(item);

    if(iParams.bytes.empty()) {
      LOG(warn, "[warn] Parameters not found in .npz file");
    } else {
      ABORT_IF(optimizerType_ != iParams.type,
               "Current ({}) and previous ({}) optimization type do not match",
               optimizerType_,
               iParams.type);

      scatterFn(iParams,
        [&](size_t localDeviceIndex, const char* begin, const char* end) {
          auto opt = opts[localDeviceIndex];
          if(!opt->pm_) { // lazily allocate
            size_t size = end - begin;  // this is size in bytes now
            if(!opt->baseAlloc_) {
              LOG_ONCE(info, "Allocating memory for general optimizer shards");
              opt->baseAlloc_ = New<TensorAllocator>(backends[localDeviceIndex]);
              opt->baseAlloc_->reserveExact(std::vector<size_t>(numShards, size));
            }
            int elements = (int)size / (int)sizeOf(iParams.type);
            opt->baseAlloc_->allocate(opt->pm_, {1, elements}, iParams.type);
            opt->baseAlloc_->allocate(opt->gd_, {1, elements}, iParams.type);
          }
          opt->pm_->set(begin, end, iParams.type); // set the value
        });
    }
  }

  if(mvAvg_) {
    io::Item iAvg;
    for(auto item : items)
      if(item.name == "exp_smoothing")
        iAvg = std::move(item);

    if(iAvg.bytes.empty()) {
      LOG(warn, "[warn] Average not found in .npz file");
    } else {
      ABORT_IF(optimizerType_ != iAvg.type,
          "Current ({}) and previous ({}) optimization type do not match",
          optimizerType_,
          iAvg.type);

      scatterFn(iAvg,
        [&](size_t localDeviceIndex, const char* begin, const char* end) {
          auto opt = opts[localDeviceIndex];
          if(!opt->avg_) { // lazily allocate
            size_t size = end - begin;  // this is size in bytes now
            if(!opt->baseAlloc_) {
              LOG_ONCE(info, "Allocating memory for general optimizer shards");
              opt->baseAlloc_ = New<TensorAllocator>(backends[localDeviceIndex]);
              opt->baseAlloc_->reserveExact(std::vector<size_t>(numShards, size));
            }
            int elements = (int)size / (int)sizeOf(iAvg.type);
            opt->baseAlloc_->allocate(opt->avg_, {1, elements}, iAvg.type);
          }
          opt->avg_->set(begin, end, iAvg.type); // set the value
        });
    }
  }
}

void OptimizerBase::save(std::vector<io::Item>& items,
                         const std::vector<Ptr<OptimizerBase>>& opts,
                         const GatherStateFunc& gatherFn,
                         bool isMainProcess) {
  isMainProcess; // (unused; referenced only to silence a compiler warning)
  if(castOptimizerType_) {
    // fetch and concatenate state vectors for high precision copy
    io::Item pm = gatherFn(
      [&](size_t localDeviceIndex) {
        auto opt = opts[localDeviceIndex];
        io::Item item;
        opt->pm_->get(item, "master_parameters");
        return item;
      });
    items.emplace_back(std::move(pm));
  }
  if(mvAvg_) {
    // fetch and concatenate state vectors for smoothed parameters
    io::Item avg = gatherFn(
      [&](size_t localDeviceIndex) {
        auto opt = opts[localDeviceIndex];
        io::Item item;
        opt->avg_->get(item, "exp_smoothing");
        return item;
      });
    items.emplace_back(std::move(avg));
  }
}

void Sgd::updateImpl(Tensor params, Tensor grads, size_t actualMBSize) {
  actualMBSize; // (no correction for base update needed beyond using ce-sum)
  using namespace functional;
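  // plain SGD step: params = params - eta_ * grads (no momentum, no mini-batch-size scaling)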
  Element(_1 -= eta_ * _2,
          params,
          grads);
}

void Sgd::load(std::vector<io::Item>& items,
               const std::vector<Ptr<OptimizerBase>>& opts,
               const std::vector<Ptr<Backend>>& backends,
               const ScatterStateFunc& scatterFn,
               bool isMainProcess) {
  OptimizerBase::load(items, opts, backends, scatterFn, isMainProcess);
}

void Sgd::save(std::vector<io::Item>& items,
               const std::vector<Ptr<OptimizerBase>>& opts,
               const GatherStateFunc& gatherFn,
               bool isMainProcess) {
  OptimizerBase::save(items, opts, gatherFn, isMainProcess); // collect parameters from base
}


// Adagrad
void Adagrad::updateImpl(Tensor params, Tensor grads, size_t actualMBSize) {
  actualMBSize; // not used in Adagrad

  // allocate optimizer-specific parameters
  if(!alloc_) {
    LOG_ONCE(info, "Allocating memory for Adagrad-specific shards");
    alloc_ = New<TensorAllocator>(params->getBackend());
  }

  if(!gt_) {
    int elements = (int)params->size();
    alloc_->reserveExact(params->memory()->size());
    alloc_->allocate(gt_, {1, elements}, params->type());
    gt_->set(0.f);
  }

  using namespace functional;

  Element(_1 += (_2 * _2), gt_, grads);

  // make sure eps_ does not drop below smallest (positive) value, add some reserve by multiplying with 2
  eps_ = (float)std::max(NumericLimits<double>(params->type()).min * 2.f, (double)eps_);
  Element(_1 -= (eta_ / (sqrt(_2) + eps_)) * _3,
          params,
          gt_,
          grads);
}

void Adagrad::load(std::vector<io::Item>& items,
                   const std::vector<Ptr<OptimizerBase>>& opts,
                   const std::vector<Ptr<Backend>>& backends,
                   const ScatterStateFunc& scatterFn,
                   bool isMainProcess) {
  OptimizerBase::load(items, opts, backends, scatterFn, isMainProcess);

  if(isMainProcess)
    LOG(info, "Loading Adagrad parameters");

  io::Item iGt;
  for(auto item : items)
    // extract data into vectors
    if(item.name == "adagrad_gt")
      iGt = std::move(item);

  if(iGt.bytes.empty()) {
    LOG(warn, "[warn] Adagrad parameters not found in checkpoint");
    return;
  }

  ABORT_IF(optimizerType_ != iGt.type,
          "Current ({}) and previous ({}) optimization type do not match",
          optimizerType_,
          iGt.type);

  scatterFn(iGt,
    [&](size_t localDeviceIndex, const char* begin, const char* end) {
      auto opt = std::dynamic_pointer_cast<Adagrad>(opts[localDeviceIndex]);
      if(!opt->gt_) {
        if(!opt->alloc_)
          opt->alloc_ = New<TensorAllocator>(backends[localDeviceIndex]);

        size_t size = end - begin; // this is size in bytes now
        int elements = (int)size / (int)sizeOf(iGt.type);
        opt->alloc_->reserveExact(size);
        opt->alloc_->allocate(opt->gt_, {1, elements}, iGt.type);
      }

      opt->gt_->set(begin, end, iGt.type);
    });
}

void Adagrad::save(std::vector<io::Item>& items,
                   const std::vector<Ptr<OptimizerBase>>& opts,
                   const GatherStateFunc& gatherFn,
                   bool isMainProcess) {
  OptimizerBase::save(items, opts, gatherFn, isMainProcess); // collect parameters from base

  if(isMainProcess)
    LOG(info, "Saving Adagrad parameters");

  // fetch and concatenate state vectors from distributed shards into a CPU-side vector
  io::Item gt = gatherFn(
    [&](size_t localDeviceIndex) {
      auto opt = std::dynamic_pointer_cast<Adagrad>(opts[localDeviceIndex]);
      io::Item item;
      opt->gt_->get(item, "adagrad_gt");
      return item;
    });
  items.emplace_back(std::move(gt));
}

void Adagrad::resetStats() {
  if(gt_)
    gt_->set(0.f);
}

// Adam
void Adam::updateImpl(Tensor params, Tensor grads, size_t actualMBSize) {
  // lazy allocation
  if(!alloc_) {
    LOG_ONCE(info, "Allocating memory for Adam-specific shards");
    alloc_ = New<TensorAllocator>(params->getBackend());
  }

  if(!mt_) {
    int elements = (int)params->size();
    size_t shard = (size_t)elements * sizeOf(params->type());
    alloc_->reserveExact({shard, shard});

    alloc_->allocate(mt_, {1, elements}, params->type());
    mt_->set(0.f);

    alloc_->allocate(vt_, {1, elements}, params->type());
    vt_->set(0.f);
  }

  double T = 1, Tref = 1;
  if(OptimizerBase::refMBWordsParam_ > 0) {
    T = (double)actualMBSize;
    if(actualMBSize > refMBWordsParam_)
      Tref = (double)refMBWordsParam_;
    else
      Tref = T;
  }

  // adjust for minibatch-size changes if Adam parameters are given a reference size (else do nothing)
  // Why the T/Tref factor on eta? The Adam optimizer adds an RMS-normalized gradient
  // value (times learning rate) to the model. We know that for Tref, that learning rate is good.
  // If we increase the batch size by (T/Tref), then without adjustment, we would still add an
  // RMS-normalized gradient value. That means that the contribution of an individual label is
  // now weighted down by (T/Tref). However, batch-size agnostic hyper-parameterization aims to keep
  // the weight on the contribution of each label gradient invariant. Thus, we must undo that
  // down-weighting, by multiplying the RMS-normalized gradient value by an additional factor
  // of (T/Tref). This is implemented here by locally multiplying the learning rate
  // with that factor.
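  // (worked example with hypothetical numbers: eta_ = 0.0003, refMBWordsParam_ = 25000 target
  //  words and an actual batch of T = 50000 words give an effective learning rate of
  //  0.0003 * 50000/25000 = 0.0006)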
  double eta   = eta_ * (T / Tref);
  double beta1 = beta1_;
  double beta2 = beta2_;
  double decay = w_;

  // denominators. At steady state: =1. This recursion does the same as the Adam beta correction term.
  denom1_ = (beta1 * denom1_) + (1 - beta1); // momentum smoothing
  denom2_ = (beta2 * denom2_) + (1 - beta2); // RMS normalization
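  // (starting from denom1_ = denom2_ = 0, after t updates denom1_ = 1 - beta1^t and
  //  denom2_ = 1 - beta2^t, so the divisions by denom1_/denom2_ below reproduce the
  //  standard Adam bias-correction terms)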

  // numerators. Divide by T to convert ce-sum gradient to avg gradient.
  using namespace functional;
#if 0 // why the division by T or T^2 here? It's T=1 without mb-ref anyway and we have the adjustment above, also converges a lot(!) slower with T != 1
  Element(_1 = ((float)beta1 * _1) + float((1 - beta1) / T    ) *  _2,       mt_, grads); // momentum smoothing. At steady state: =smoothed avg gradient
  Element(_1 = ((float)beta2 * _1) + float((1 - beta2) / T / T) * (_2 * _2), vt_, grads); // RMS normalization.  At steady state: =mean square of the avg gradients
#else
  Element(_1 = ((float)beta1 * _1) + float((1 - beta1)) *  _2,       mt_, grads); // momentum smoothing. At steady state: =smoothed avg gradient
  Element(_1 = ((float)beta2 * _1) + float((1 - beta2)) * (_2 * _2), vt_, grads); // RMS normalization.  At steady state: =mean square of the avg gradients
#endif

  // make sure eps_ does not drop below minimum value, this is important
  // when training with mixed precision. Otherwise we divide by 0.
  // We multiply the minimum by 2 in order to step away from the abyss.
  eps_ = std::max(NumericLimits<float>(params->type()).min * 2.f, eps_);

  // apply Adam normalization
  float etaf = (float)eta, denom1f = (float)denom1_, denom2f = (float)denom2_, decayf = (float)decay; // (get casts out of Element expression for readability)
  Element(_1 -= etaf                               // learning-rate: x_t = x_{t-1} - \eta * (...)
                * ((  (     _2 / denom1f)          // momentum-smoothed per-sample gradient: m_{t-1}
                    / (sqrt(_3 / denom2f) + eps_)) // normalize by RMS: \sqrt(v_{t-1})
                   + (decayf * _1)),                 // weight-decay: w * x_{t-1}
          params,  // =_1
          mt_,     // =_2
          vt_      // =_3
          );
}

void Adam::load(std::vector<io::Item>& items,
                const std::vector<Ptr<OptimizerBase>>& opts,
                const std::vector<Ptr<Backend>>& backends,
                const ScatterStateFunc& scatterFn,
                bool isMainProcess) {
  OptimizerBase::load(items, opts, backends, scatterFn, isMainProcess);

  if(isMainProcess)
    LOG(info, "Loading Adam parameters");

  io::Item iMt;
  io::Item iVt;
  std::array<double, 2> vDenoms = {0, 0}; // zero-initialize for back compat (see note below)

  for(auto item : items) {
    // extract data into vectors
    if(item.name == "adam_mt") {
      iMt = std::move(item);
    } else if(item.name == "adam_vt") {
      iVt = std::move(item);
    } else if(item.name == "adam_denoms") {
      ABORT_IF(item.size() != 2 * sizeof(double), "adam_denoms should have 2 entries not {} bytes", item.size());
      std::copy((double*)item.data(), ((double*)item.data()) + 2, vDenoms.begin());
      // Back compat note: Old files lacked "adam_denoms". For those, vDenoms will remain 0, which reproduces the old behavior.
    }
  }

  if(iMt.bytes.empty() || iVt.bytes.empty()) {
    LOG(warn, "[warn] Adam parameters not found in .npz file");
    return;
  }

  ABORT_IF(optimizerType_ != iMt.type,
           "Current ({}) and previous ({}) optimization type do not match",
           optimizerType_,
           iMt.type);

  ABORT_IF(iMt.size() != iVt.size(), "mt and vt have different sizes??");

  scatterFn(iMt,
    [&](size_t localDeviceIndex, const char* begin, const char* end) {
      auto opt = std::dynamic_pointer_cast<Adam>(opts[localDeviceIndex]);

      // denominators need to be set in all shards, hijack this scatter
      opt->denom1_ = vDenoms[0];
      opt->denom2_ = vDenoms[1];

      if(!opt->mt_ || !opt->vt_) { // lazily allocate
        if(!opt->alloc_)
          opt->alloc_ = New<TensorAllocator>(backends[localDeviceIndex]);
        size_t size = end - begin;  // this is size in bytes now
        int elements = (int)size / (int)sizeOf(iMt.type);
        opt->alloc_->reserveExact(2 * size);
        opt->alloc_->allocate(opt->mt_, {1, elements}, iMt.type);
        opt->alloc_->allocate(opt->vt_, {1, elements}, iMt.type);
      }
      opt->mt_->set(begin, end, iMt.type); // set the value
    });

  scatterFn(iVt,
    [&](size_t localDeviceIndex, const char* begin, const char* end) {
      auto opt = std::dynamic_pointer_cast<Adam>(opts[localDeviceIndex]);
      opt->vt_->set(begin, end, iVt.type);
    });
}

void Adam::save(std::vector<io::Item>& items,
                const std::vector<Ptr<OptimizerBase>>& opts,
                const GatherStateFunc& gatherFn,
                bool isMainProcess) {
  OptimizerBase::save(items, opts, gatherFn, isMainProcess); // collect parameters from base

  if(isMainProcess)
    LOG(info, "Saving Adam parameters");

  // fetch and concatenate state vectors from distributed shards into a CPU-side vector
  io::Item mt = gatherFn(
    [&](size_t localDeviceIndex) {
      auto opt = std::dynamic_pointer_cast<Adam>(opts[localDeviceIndex]);
      io::Item item;
      opt->mt_->get(item, "adam_mt");
      return item;
    });
  items.emplace_back(std::move(mt));

  io::Item vt = gatherFn(
    [&](size_t localDeviceIndex) {
      auto opt = std::dynamic_pointer_cast<Adam>(opts[localDeviceIndex]);
      io::Item item;
      opt->vt_->get(item, "adam_vt");
      return item;
    });
  items.emplace_back(std::move(vt));

  std::vector<double> vDenoms{denom1_, denom2_};
  items.emplace_back(io::fromVector(vDenoms, "adam_denoms"));
}

void Adam::resetStats() {
  if(mt_)
    mt_->set(0.f);

  if(vt_)
    vt_->set(0.f);

  denom1_ = 0; // @BUGBUG: or 1 or refMBWords if so specified. Fix once we have proper parameterization for that.
  denom2_ = 0;
}

Ptr<OptimizerBase> Optimizer(Ptr<Options> options) {
  auto optType = options->get<std::string>("optimizer");
  auto params = options->has("optimizer-params")
                     ? options->get<std::vector<float>>("optimizer-params")
                     : std::vector<float>({});
  Ptr<OptimizerBase> opt;
  if(optType == "sgd") {
    opt = New<Sgd>(options);
  } else if(optType == "adagrad") {
    opt = New<Adagrad>(options);
  } else if(optType == "adam") {
    opt = New<Adam>(options);
  } else {
    ABORT("Unknown optimizer type: {}", optType);
  }

  opt->setParams(params);
  return opt;
}

}  // namespace marian
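
To make the update rule above concrete, here is a minimal, self-contained sketch of the same Adam variant: the denominator recursion in place of explicit bias-correction factors, plus decoupled weight decay. It does not use the Marian API; ToyAdam and all hyper-parameter values are hypothetical and chosen only for illustration.

// Toy single-device illustration of the Adam rule in Adam::updateImpl (not Marian code).
#include <cmath>
#include <cstdio>
#include <vector>

struct ToyAdam {
  double eta = 3e-4, beta1 = 0.9, beta2 = 0.999, eps = 1e-8, decay = 0.0; // hypothetical defaults
  double denom1 = 0.0, denom2 = 0.0;        // same role as denom1_/denom2_ above
  std::vector<double> mt, vt;               // first/second moment estimates

  void update(std::vector<double>& params, const std::vector<double>& grads) {
    if(mt.empty()) { mt.assign(params.size(), 0.0); vt.assign(params.size(), 0.0); }

    // denominator recursion: starting from 0, this equals 1 - beta^t after t steps,
    // i.e. the standard Adam bias-correction term
    denom1 = beta1 * denom1 + (1.0 - beta1);
    denom2 = beta2 * denom2 + (1.0 - beta2);

    for(size_t i = 0; i < params.size(); ++i) {
      mt[i] = beta1 * mt[i] + (1.0 - beta1) * grads[i];             // momentum smoothing
      vt[i] = beta2 * vt[i] + (1.0 - beta2) * grads[i] * grads[i];  // RMS normalization
      params[i] -= eta * ((mt[i] / denom1) / (std::sqrt(vt[i] / denom2) + eps)
                          + decay * params[i]);                     // decoupled weight decay
    }
  }
};

int main() {
  ToyAdam opt;
  std::vector<double> params = {1.0, -2.0}, grads = {0.1, -0.2};
  for(int step = 0; step < 3; ++step)
    opt.update(params, grads);
  std::printf("params after 3 steps: %f %f\n", params[0], params[1]);
  return 0;
}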