Program Listing for File optimizers.h

File: src/optimizers/optimizers.h

#pragma once

#include "common/options.h"
#include "graph/expression_graph.h"
#include "optimizers/clippers.h"
#include "optimizers/exponential_smoothing.h"
#include "tensors/backend.h"
#include "tensors/tensor.h"
#include "training/training_state.h"

#include <algorithm>
#include <map>
#include <memory>

namespace marian {

class OptimizerBase : public TrainingObserver, public ExponentialSmoothing {
public:
  OptimizerBase(Ptr<Options> options)
  : ExponentialSmoothing(options),
    options_(options),
    eta_(options_->get<float>("learn-rate")),
    refMBWordsParam_(options_->get<size_t>("mini-batch-words-ref", 0)),
    normalizedGradient_{options_->get<bool>("normalize-gradient", false)} // @TODO: get rid of this if we manage to confirm that it does not help with fp16 training
  {

    auto precisions = options_->get<std::vector<std::string>>("precision", {"float32", "float32"});
    ABORT_IF(precisions.size() < 2, "No optimizer precision type specified??");

    auto paramType = typeFromString(precisions[0]);
    optimizerType_ = typeFromString(precisions[1]);

    // if true, the model used for forward/backward uses a different type than the optimizer
    castOptimizerType_ = paramType != optimizerType_;

    // automatic learning-rate adjustment
    // If users provide, in addition to the hyper-parameters, a reference minibatch size
    // that these hyper-parameters were originally tuned for, then the learning rate gets
    // adjusted accordingly. Note: this requires the user to also use the ce-sum criterion.
    if (refMBWordsParam_ != 0)
      LOG_ONCE(info, "[optimizers] Learning rate gets automatically adjusted as if minibatch size was {}", refMBWordsParam_);
  }
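
  // Illustrative sketch (not part of the original header): with a reference
  // minibatch size of R words and an actual minibatch of N words, the intent is
  // that the effective step size scales roughly as
  //
  //   effectiveEta = eta_ * N / R;  // hypothetical name; the exact scaling is
  //                                 // applied inside update()/updateImpl()
  //
  // so hyper-parameters tuned for minibatches of R words remain valid when the
  // actual minibatch size differs.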

  virtual ~OptimizerBase() {}

  float update(Ptr<ExpressionGraph> graph, size_t mbSize, float costScaleFactor = 1.f) {
    Tensor p = graph->params()->vals();
    Tensor g = graph->params()->grads();

    return update(p, g, mbSize, costScaleFactor);
  }

  float update(Tensor params, Tensor grads, size_t mbSize, float costScaleFactor = 1.f);

  virtual void init(TrainingState& state) override {
    eta_ = state.eta;
    batchesSeen_ = state.batches;
  }

  virtual void actAfterLoaded(TrainingState& state) override {
    eta_ = state.eta;
    batchesSeen_ = state.batches;
  }

  virtual void actAfterEpoch(TrainingState& state) override {
    eta_ = state.eta;
    batchesSeen_ = state.batches;

    if(state.reset)
      resetStats();
  }

  virtual void actAfterBatches(TrainingState& state) override {
    eta_ = state.eta;
    batchesSeen_ = state.batches;

    if(state.reset)
      resetStats();
  }

  virtual void actAfterStalled(TrainingState& state) override {
    eta_ = state.eta;
    batchesSeen_ = state.batches;

    if(state.reset)
      resetStats();
  }

  virtual void setParams(const std::vector<float>& params) = 0;

  typedef std::function<void(size_t /*localDeviceIndex*/,
                             const char* /*begin*/,
                             const char* /*end*/)> ScatterStateSetFunc;
  typedef std::function<io::Item(size_t /*localDeviceIndex*/)> GatherStateGetFunc;

  typedef std::function<void(const io::Item& /*data*/, const ScatterStateSetFunc& /*setFn*/)> ScatterStateFunc;

  typedef std::function<io::Item(const GatherStateGetFunc& /*getFn*/)> GatherStateFunc;

  virtual void load(std::vector<io::Item>& /*items*/,
                    const std::vector<Ptr<OptimizerBase>>& /*opts*/,
                    const std::vector<Ptr<Backend>>& /*backends*/,
                    const ScatterStateFunc& /*scatterFn*/,
                    bool isMainProcess);

  virtual void save(std::vector<io::Item>& /*items*/,
                    const std::vector<Ptr<OptimizerBase>>& /*opts*/,
                    const GatherStateFunc& /*gatherFn*/,
                    bool isMainProcess);

  // This function swaps the current parameters with the smoothed version (provided smoothing is enabled).
  // It is usually called twice: once to swap the smoothed parameters in and once to swap them back out.
  void swapWithSmoothed(Tensor params);
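  // Sketch of the intended call pattern (illustrative only, not in the original
  // header; the graph and the code run between the two calls are assumed to
  // exist elsewhere):
  //
  //   opt->swapWithSmoothed(graph->params()->vals()); // smoothed parameters in
  //   /* ... e.g. run validation or save a checkpoint ... */
  //   opt->swapWithSmoothed(graph->params()->vals()); // original parameters back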

  // Returns the stateful optimizer shards; for the base class these are only the averaged parameters (if present).
  virtual std::vector<Tensor> getShards() {
    if(avg_)
      return { avg_ };
    else
      return { };
  }

protected:
  virtual void updateImpl(Tensor params, Tensor grads, size_t actualMBSize) = 0;
  virtual void resetStats() = 0;

  Ptr<Options> options_;

  float eta_;                      // Learning rate
  size_t refMBWordsParam_{0};      // reference MB size. This enables automatic adjustment of optimizer hyper-parameters to MB size. 0 means no adjustment
  size_t batchesSeen_{0};          // updates seen so far
  bool normalizedGradient_{false}; // has the gradient been normalized by MB size? @TODO: get rid of this if we manage to confirm that it does not help with fp16 training

  Type optimizerType_{Type::float32};
  bool castOptimizerType_{false};

  Ptr<Clipper> clipper_;  // Clip gradient norm

  Ptr<TensorAllocator> baseAlloc_; // allocator backing the tensors below
  Ptr<Allocator> alloc_;

  Tensor avg_; // exponentially smoothed (averaged) copy of the parameters, if smoothing is enabled

  Tensor pm_;  // presumably the parameters kept in optimizerType_ (used when castOptimizerType_ is true)
  Tensor gd_;  // presumably the gradients cast to optimizerType_ (used when castOptimizerType_ is true)
};

class Sgd : public OptimizerBase {
public:
  Sgd(Ptr<Options> options) : OptimizerBase(options) {}

  void load(std::vector<io::Item>& /*items*/,
            const std::vector<Ptr<OptimizerBase>>& /*opts*/,
            const std::vector<Ptr<Backend>>& /*backends*/,
            const ScatterStateFunc& /*scatterFn*/,
            bool isMainProcess) override;

  void save(std::vector<io::Item>& items,
            const std::vector<Ptr<OptimizerBase>>& opts,
            const GatherStateFunc& gatherFn,
            bool isMainProcess) override;

  virtual void setParams(const std::vector<float>& /*params*/) override {}
private:
  void updateImpl(Tensor params, Tensor grads, size_t actualMBSize) override;

  virtual void resetStats() override {}
};
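
// Conceptually, plain SGD as declared above performs an update of the form
// (sketch only, not verbatim from the implementation):
//
//   params[i] -= eta * grads[i];  // element-wise, with eta possibly rescaled
//                                 // by actualMBSize / refMBWordsParam_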

class Adagrad : public OptimizerBase {
public:
  Adagrad(Ptr<Options> options) : OptimizerBase(options) {}

  void load(std::vector<io::Item>& /*items*/,
            const std::vector<Ptr<OptimizerBase>>& /*opts*/,
            const std::vector<Ptr<Backend>>& /*backends*/,
            const ScatterStateFunc& /*scatterFn*/,
            bool isMainProcess) override;

  void save(std::vector<io::Item>& items,
            const std::vector<Ptr<OptimizerBase>>& opts,
            const GatherStateFunc& gatherFn,
            bool isMainProcess) override;

  void setParams(const std::vector<float>& params) override {
    if(params.size() > 0)
      eps_ = params[0];
  }

  std::vector<Tensor> getShards() override {
    auto shards = OptimizerBase::getShards();
    shards.push_back(gt_);
    return shards;
  }

private:
  void updateImpl(Tensor params, Tensor grads, size_t actualMBSize) override;
  void resetStats() override;

  float eps_ = 1e-8f;
  Ptr<TensorAllocator> alloc_;
  Tensor gt_;
};
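
// Conceptually, Adagrad as declared above accumulates squared gradients in gt_
// and scales each step by them (sketch of the standard Adagrad rule, not
// verbatim from the implementation):
//
//   gt_[i]    += grads[i] * grads[i];
//   params[i] -= eta * grads[i] / (sqrt(gt_[i]) + eps_);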

class Adam : public OptimizerBase {
public:
  Adam(Ptr<Options> options) : OptimizerBase(options) {}

  void load(std::vector<io::Item>& /*items*/,
            const std::vector<Ptr<OptimizerBase>>& /*opts*/,
            const std::vector<Ptr<Backend>>& /*backends*/,
            const ScatterStateFunc& /*scatterFn*/,
            bool isMainProcess) override;

  void save(std::vector<io::Item>& items,
            const std::vector<Ptr<OptimizerBase>>& opts,
            const GatherStateFunc& gatherFn,
            bool isMainProcess) override;

  std::vector<Tensor> getShards() override {
    auto shards = OptimizerBase::getShards();
    shards.push_back(mt_);
    shards.push_back(vt_);
    return shards;
  }

private:
  void updateImpl(Tensor params, Tensor grads, size_t actualMBSize) override;
  void resetStats() override;

  // Adam parameters:
  // [beta1, beta2, eps, w, refMBWords]
  virtual void setParams(const std::vector<float>& params) override {
    if(params.size() > 0)
      beta1_ = params[0];
    if(params.size() > 1)
      beta2_ = params[1];
    if(params.size() > 2)
      eps_ = params[2];

    // weight decay for AdamW; experimental, disabled by default
    if(params.size() > 3)
      w_ = params[3]; // default (disabled): 0
  }

  // hyper-parameters
  float beta1_ = 0.9f;
  float beta2_ = 0.999f;
  float eps_ = 1e-8f;
  float w_ = 0.0f;
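
  // For reference, the standard Adam/AdamW update that these hyper-parameters
  // enter (sketch only; the implementation may fold bias correction and
  // minibatch-size scaling in differently):
  //
  //   mt_ = beta1_ * mt_ + (1 - beta1_) * g;
  //   vt_ = beta2_ * vt_ + (1 - beta2_) * g * g;
  //   p  -= eta * mt_hat / (sqrt(vt_hat) + eps_) + eta * w_ * p;
  //
  // where mt_hat and vt_hat are the bias-corrected first and second moments,
  // and w_ is the decoupled (AdamW-style) weight decay.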

  // CPU-side running accumulators
  double denom1_ = 0;
  double denom2_ = 0;

  // GPU-side running accumulators
  Ptr<TensorAllocator> alloc_;
  Tensor mt_;
  Tensor vt_;
};

Ptr<OptimizerBase> Optimizer(Ptr<Options> options);
}  // namespace marian
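
// Example of how the factory above is typically used (illustrative sketch only;
// the Options object and the expression graph are assumed to be configured
// elsewhere, and mbSizeInWords is a hypothetical variable name):
//
//   Ptr<Options> options = ...;  // holds "learn-rate", "precision", the optimizer choice, etc.
//   Ptr<OptimizerBase> opt = marian::Optimizer(options);
//   float ret = opt->update(graph, mbSizeInWords);  // one optimizer step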