Program Listing for File config_parser.cpp

↰ Return to documentation for file (src/common/config_parser.cpp)
#include "common/authors.h"
#include "common/build_info.h"
#include "common/cli_helper.h"
#include "common/config.h"
#include "common/config_parser.h"
#include "common/config_validator.h"
#include "common/definitions.h"
#include "common/file_stream.h"
#include "common/logging.h"
#include "common/options.h"
#include "common/regex.h"
#include "common/utils.h"
#include <algorithm>
#include <set>
#include <stdexcept>
#include <string>

#if MKL_FOUND
#include <mkl.h>
#else
#if BLAS_FOUND
#include <cblas.h>
#endif
#endif

namespace marian {

// TODO: Move this to CLIWrapper and allow to mark options as paths in the same place they are
// defined
// clang-format off
// Names of the options whose values are paths. A std::set keeps its keys sorted, so the grouping
// below is purely for readability.
const std::set<std::string> PATHS{
  // model files
  "model",
  "models",
  "pretrained-model",

  // training data and resources
  "train-sets",
  "vocabs",
  "embedding-vectors",
  "data-weighting",
  "sqlite",     // except: 'temporary', handled in the processPaths function
  "shortlist",  // except: only the first element in the sequence is a path, handled in the
                //  processPaths function

  // validation
  "valid-sets",
  "valid-script-path",
  "valid-script-args",
  "valid-log",
  "valid-translation-output",

  // input/output and logging
  "input",   // except: 'stdin', handled in makeAbsolutePaths and interpolateEnvVars
  "output",  // except: 'stdout', handled in makeAbsolutePaths and interpolateEnvVars
  "log",
};
// clang-format on

// Joins the program arguments into a single, blank-separated command-line string, quoting
// arguments so that the result can be pasted back into a POSIX-style shell.
//
// An argument is wrapped in single quotes if it is empty or contains a character with a special
// meaning to the shell (white space, quotes, globbing characters, redirections, command
// separators, ...). Every single quote inside an argument is emitted as '\'' regardless of
// whether the argument is quoted, which is the usual way to embed a single quote in a
// single-quoted shell word.
//
// @param argc number of entries in argv
// @param argv argument strings, each a null-terminated C string
// @return the escaped arguments separated by single blanks; empty if argc is 0
std::string escapeCmdLine(int argc, char** argv){
  // Characters that trigger quoting. In addition to the historical set this covers ';', tab,
  // newline, '[', ']' and '~', which would otherwise be interpreted by the shell.
  static const char* const shellSpecialChars = " \t\n#`\"'\\${}[]|&;^?*!()%><~";

  std::string cmdLine;
  for(int i = 0; i < argc; i++) {
    const std::string arg = argv[i];
    const bool needsQuotes
        = arg.empty() || arg.find_first_of(shellSpecialChars) != std::string::npos;

    if(!cmdLine.empty())
      cmdLine.push_back(' ');

    if(needsQuotes)
      cmdLine.push_back('\'');
    // Plain character scan instead of a regex: only a single character has to be replaced.
    for(char c : arg) {
      if(c == '\'')
        cmdLine += "'\\''";  // close quote, escaped quote, reopen quote
      else
        cmdLine.push_back(c);
    }
    if(needsQuotes)
      cmdLine.push_back('\'');
  }
  return cmdLine;
}

std::string const& ConfigParser::cmdLine() const {
  return cmdLine_;
}

// Sets up the command-line parser for the given tool mode and registers all option groups.
//
// Server mode is stored internally as translation mode (mode_), i.e. a server gets every
// translation option plus the server-specific ones. Any mode other than training, translation,
// scoring or embedding (after that mapping) aborts with "wrong CLI mode".
ConfigParser::ConfigParser(cli::mode mode)
  : cli_(config_,
         "Marian: Fast Neural Machine Translation in C++",
         "General options",
         "",
         40),
    mode_(mode == cli::mode::server ? cli::mode::translation : mode) {
  // mode_ no longer distinguishes the server, so remember it from the original argument
  const bool isServer = (mode == cli::mode::server);

  // option groups shared by every mode
  addOptionsGeneral(cli_);
  if(isServer) {
    addOptionsServer(cli_);
  }
  addOptionsModel(cli_);

  // clang-format off
  // mode-specific option groups
  if(mode_ == cli::mode::training) {
    addOptionsTraining(cli_);
    addOptionsValidation(cli_);
  } else if(mode_ == cli::mode::translation) {
    addOptionsTranslation(cli_);
  } else if(mode_ == cli::mode::scoring) {
    addOptionsScoring(cli_);
  } else if(mode_ == cli::mode::embedding) {
    addOptionsEmbedding(cli_);
  } else {
    ABORT("wrong CLI mode");
  }

  // aliases are registered last, after all options have been defined
  addAliases(cli_);
  // clang-format on
}

// Registers the options of the "General options" group on the given CLI wrapper.
//
// The default of --workspace depends on mode_: 512 in translation mode, 2048 otherwise.
// --sigterm is only registered in training mode.
//
// @param cli wrapper on which the options are defined
void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) {
  // smaller default work space for translation (unit is MB, see the --workspace help text)
  int defaultWorkspace = (mode_ == cli::mode::translation) ? 512 : 2048;

  cli.switchGroup("General options");

  // clang-format off
  // informational options that print something and exit
  cli.add<bool>("--authors",
    "Print list of authors and exit");
  cli.add<bool>("--cite",
    "Print citation and exit");
  cli.add<std::string>("--build-info",
    "Print CMake build options and exit. Set to 'all' to print advanced options")
    ->implicit_val("basic");
  cli.add<std::vector<std::string>>("--config,-c",
    "Configuration file(s). If multiple, later overrides earlier");
  cli.add<size_t>("--workspace,-w",
    "Preallocate arg MB of work space",
    defaultWorkspace);
  // logging options
  cli.add<std::string>("--log",
    "Log training process information to file given by arg");
  cli.add<std::string>("--log-level",
    "Set verbosity level of logging: trace, debug, info, warn, err(or), critical, off",
    "info");
  cli.add<std::string>("--log-time-zone",
    "Set time zone for the date shown on logging");
  cli.add<bool>("--quiet",
    "Suppress all logging to stderr. Logging to files still works");
  cli.add<bool>("--quiet-translation",
    "Suppress logging for translation");
  cli.add<size_t>("--seed",
    "Seed for all random number generators. 0 means initialize randomly");
  cli.add<bool>("--check-nan",
    "Check for NaNs or Infs in forward and backward pass. Will abort when found. "
    "This is a diagnostic option that will slow down computation significantly");
  // path handling and configuration dumping
  cli.add<bool>("--interpolate-env-vars",
    "allow the use of environment variables in paths, of the form ${VAR_NAME}");
  cli.add<bool>("--relative-paths",
    "All paths are relative to the config file location");
  cli.add<std::string>("--dump-config",
    "Dump current (modified) configuration to stdout and exit. Possible values: full, minimal, expand")
    ->implicit_val("full");
  if(mode_ == cli::mode::training) {
    // --sigterm is deliberately not a boolean, to allow for a consistent
    // pattern of specifying custom signal handling in the future.
    // (e.g., dump model but continue training upon SIGUSR1, or report current
    // training status upon SIGINFO.)
    cli.add<std::string>("--sigterm",
      "What to do with SIGTERM: save-and-exit or exit-immediately.",
      "save-and-exit");
  }
  // clang-format on
}

// Registers the options of the "Server options" group (currently only the port of the web socket
// server) and then switches back to the group returned by the first switchGroup() call.
//
// @param cli wrapper on which the options are defined
void ConfigParser::addOptionsServer(cli::CLIWrapper& cli) {
  // clang-format off
  auto enclosingGroup = cli.switchGroup("Server options");

  cli.add<size_t>("--port,-p", "Port number for web socket server", 8080);

  cli.switchGroup(enclosingGroup);
  // clang-format on
}

// Registers the options of the "Model options" group: model paths, architecture settings for
// RNN (s2s) and transformer models, BERT-related options and, in training mode only, dropout.
//
// Which options exist depends on mode_ and on compile-time flags:
//  - translation mode defines --models (a list), all other modes define --model (a single path);
//  - --pretrained-model and the dropout options are only defined in training mode;
//  - --model-mmap requires COMPILE_CPU and translation mode;
//  - the char-s2s options require CUDNN.
// The group returned by the initial switchGroup() call is passed back to switchGroup() at the end.
//
// @param cli wrapper on which the options are defined
void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) {
  auto previous_group = cli.switchGroup("Model options");

  // clang-format off
  // -m is shared by both variants, but maps to a list in translation mode and to one path otherwise
  if(mode_ == cli::mode::translation) {
    cli.add<std::vector<std::string>>("--models,-m",
      "Paths to model(s) to be loaded. Supported file extensions: .npz, .bin");
  } else {
    cli.add<std::string>("--model,-m",
      "Path prefix for model to be saved/resumed. Supported file extensions: .npz, .bin",
      "model.npz");

    if(mode_ == cli::mode::training) {
      cli.add<std::string>("--pretrained-model",
        "Path prefix for pre-trained model to initialize model weights");
    }
  }
#ifdef COMPILE_CPU
  if(mode_ == cli::mode::translation) {
    cli.add<bool>("--model-mmap",
      "Use memory-mapping when loading model (CPU only)");
  }
#endif
  // General architecture and RNN (s2s) options
  cli.add<bool>("--ignore-model-config",
      "Ignore the model configuration saved in npz file");
  cli.add<std::string>("--type",
      "Model type: amun, nematus, s2s, multi-s2s, transformer",
      "amun");
  cli.add<std::vector<int>>("--dim-vocabs",
      "Maximum items in vocabulary ordered by rank, 0 uses all items in the provided/created vocabulary file",
      {0, 0});
  cli.add<int>("--dim-emb",
      "Size of embedding vector",
      512);
  cli.add<int>("--factors-dim-emb",
      "Embedding dimension of the factors. Only used if concat is selected as factors combining form");
  cli.add<std::string>("--factors-combine",
    "How to combine the factors and lemma embeddings. Options available: sum, concat",
    "sum");
  cli.add<std::string>("--lemma-dependency",
      "Lemma dependency method to use when predicting target factors. Options: soft-transformer-layer, hard-transformer-layer, lemma-dependent-bias, re-embedding");
  cli.add<int>("--lemma-dim-emb",
      "Re-embedding dimension of lemma in factors",
      0);
  cli.add<int>("--dim-rnn",
      "Size of rnn hidden state", 1024);
  cli.add<std::string>("--enc-type",
      "Type of encoder RNN : bidirectional, bi-unidirectional, alternating (s2s)",
      "bidirectional");
  cli.add<std::string>("--enc-cell",
      "Type of RNN cell: gru, lstm, tanh (s2s)", "gru");
  cli.add<int>("--enc-cell-depth",
      "Number of transitional cells in encoder layers (s2s)",
      1);
  cli.add<int>("--enc-depth",
      "Number of encoder layers (s2s)",
      1);
  cli.add<std::string>("--dec-cell",
      "Type of RNN cell: gru, lstm, tanh (s2s)",
      "gru");
  cli.add<int>("--dec-cell-base-depth",
      "Number of transitional cells in first decoder layer (s2s)",
      2);
  cli.add<int>("--dec-cell-high-depth",
      "Number of transitional cells in next decoder layers (s2s)",
      1);
  cli.add<int>("--dec-depth",
      "Number of decoder layers (s2s)",
      1);
  cli.add<bool>("--skip",
      "Use skip connections (s2s)");
  cli.add<bool>("--layer-normalization",
      "Enable layer normalization");
  cli.add<bool>("--right-left",
      "Train right-to-left model");
  cli.add<std::vector<std::string>>("--input-types",
      "Provide type of input data if different than 'sequence'. "
      "Possible values: sequence, class, alignment, weight. "
      "You need to provide one type per input file (if --train-sets) or per TSV field (if --tsv).",
      {});
  cli.add<bool>("--best-deep",
      "Use Edinburgh deep RNN configuration (s2s)");
  cli.add<bool>("--tied-embeddings",
      "Tie target embeddings and output embeddings in output layer");
  cli.add<bool>("--tied-embeddings-src",
      "Tie source and target embeddings");
  cli.add<bool>("--tied-embeddings-all",
      "Tie all embedding layers and output layer");
  cli.add<bool>("--output-omit-bias",
      "Do not use a bias vector in decoder output layer");

  // Transformer options
  cli.add<int>("--transformer-heads",
      "Number of heads in multi-head attention (transformer)",
      8);
  cli.add<bool>("--transformer-no-projection",
      "Omit linear projection after multi-head attention (transformer)");
  cli.add<bool>("--transformer-pool",
      "Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)");
  cli.add<int>("--transformer-dim-ffn",
      "Size of position-wise feed-forward network (transformer)",
      2048);
  cli.add<int>("--transformer-decoder-dim-ffn",
      "Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.",
      0);
  cli.add<int>("--transformer-ffn-depth",
      "Depth of filters (transformer)",
      2);
  cli.add<int>("--transformer-decoder-ffn-depth",
      "Depth of filters in decoder (transformer). Uses --transformer-ffn-depth if 0",
      0);
  cli.add<std::string>("--transformer-ffn-activation",
      "Activation between filters: swish or relu (transformer)",
      "swish");
  cli.add<int>("--transformer-dim-aan",
      "Size of position-wise feed-forward network in AAN (transformer)",
      2048);
  cli.add<int>("--transformer-aan-depth",
      "Depth of filter for AAN (transformer)",
      2);
  cli.add<std::string>("--transformer-aan-activation",
      "Activation between filters in AAN: swish or relu (transformer)",
      "swish");
  cli.add<bool>("--transformer-aan-nogate",
      "Omit gate in AAN (transformer)");
  cli.add<std::string>("--transformer-decoder-autoreg",
      "Type of autoregressive layer in transformer decoder: self-attention, average-attention (transformer)",
      "self-attention");
  cli.add<std::vector<size_t>>("--transformer-tied-layers",
      "List of tied decoder layers (transformer)");
  cli.add<std::string>("--transformer-guided-alignment-layer",
      "Last or number of layer to use for guided alignment training in transformer",
      "last");
  cli.add<std::string>("--transformer-preprocess",
      "Operation before each transformer layer: d = dropout, a = add, n = normalize");
  cli.add<std::string>("--transformer-postprocess-emb",
      "Operation after transformer embedding layer: d = dropout, a = add, n = normalize",
      "d");
  cli.add<std::string>("--transformer-postprocess",
      "Operation after each transformer layer: d = dropout, a = add, n = normalize",
      "dan");
  cli.add<std::string>("--transformer-postprocess-top",
      "Final operation after a full transformer stack: d = dropout, a = add, n = normalize. The optional skip connection with 'a' by-passes the entire stack.",
      "");
  cli.add<bool>("--transformer-train-position-embeddings",
      "Train positional embeddings instead of using static sinusoidal embeddings");
  cli.add<bool>("--transformer-depth-scaling",
      "Scale down weight initialization in transformer layers by 1 / sqrt(depth)");

  // BERT options
  cli.add<std::string>("--bert-mask-symbol", "Masking symbol for BERT masked-LM training", "[MASK]");
  cli.add<std::string>("--bert-sep-symbol", "Sentence separator symbol for BERT next sentence prediction training", "[SEP]");
  cli.add<std::string>("--bert-class-symbol", "Class symbol BERT classifier training", "[CLS]");
  cli.add<float>("--bert-masking-fraction", "Fraction of masked out tokens during training", 0.15f);
  cli.add<bool>("--bert-train-type-embeddings", "Train bert type embeddings, set to false to use static sinusoidal embeddings", true);
  cli.add<int>("--bert-type-vocab-size", "Size of BERT type vocab (sentence A and B)", 2);
#ifdef CUDNN
  // char-s2s options, only available in builds with CUDNN defined
  cli.add<int>("--char-stride",
      "Width of max-pooling layer after convolution layer in char-s2s model",
      5);
  cli.add<int>("--char-highway",
      "Number of highway network layers after max-pooling in char-s2s model",
      4);
  cli.add<std::vector<int>>("--char-conv-filters-num",
      "Numbers of convolution filters of corresponding width in char-s2s model",
      {200, 200, 250, 250, 300, 300, 300, 300});
  cli.add<std::vector<int>>("--char-conv-filters-widths",
      "Convolution window widths in char-s2s model",
      {1, 2, 3, 4, 5, 6, 7, 8});
#endif

  // dropout options are only defined for training
  if(mode_ == cli::mode::training) {
    // TODO: add ->range(0,1);
    cli.add<float>("--dropout-rnn",
        "Scaling dropout along rnn layers and time (0 = no dropout)");
    cli.add<float>("--dropout-src",
        "Dropout source words (0 = no dropout)");
    cli.add<float>("--dropout-trg",
        "Dropout target words (0 = no dropout)");
    cli.add<float>("--transformer-dropout",
        "Dropout between transformer layers (0 = no dropout)");
    cli.add<float>("--transformer-dropout-attention",
        "Dropout for transformer attention (0 = no dropout)");
    cli.add<float>("--transformer-dropout-ffn",
        "Dropout for transformer filter (0 = no dropout)");
  }
  cli.switchGroup(previous_group);
  // clang-format on
}

// Registers the options of the "Training options" group: loss definition, training data and
// vocabularies, scheduling, data management, optimizer and learning-rate settings, embeddings,
// mixed-precision training and predefined tasks.
//
// Shared sub-groups are added through the addSuboptions*() helpers (input length, TSV, devices,
// batching, quantization, ULR). The SentencePiece options are only defined when
// USE_SENTENCEPIECE is set at compile time. The group returned by the initial switchGroup() call
// is passed back to switchGroup() at the end.
//
// @param cli wrapper on which the options are defined
void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
  auto previous_group = cli.switchGroup("Training options");
  // clang-format off
  cli.add<std::string>("--cost-type", // @TODO: rename to loss-type
      "Optimization criterion: ce-mean, ce-mean-words, ce-sum, perplexity", "ce-sum");
  cli.add<std::string>("--multi-loss-type",
      "How to accumulate multi-objective losses: sum, scaled, mean", "sum");
  cli.add<bool>("--unlikelihood-loss",
      "Use word-level weights as indicators for sequence-level unlikelihood training");
  cli.add<bool>("--overwrite",
      "Do not create model checkpoints, only overwrite main model file with last checkpoint. "
      "Reduces disk usage");
  cli.add<bool>("--no-reload",
      "Do not load existing model specified in --model arg");
  cli.add<std::vector<std::string>>("--train-sets,-t",
      "Paths to training corpora: source target");
  cli.add<std::vector<std::string>>("--vocabs,-v",
      "Paths to vocabulary files have to correspond to --train-sets. "
      "If this parameter is not supplied we look for vocabulary files "
      "source.{yml,json} and target.{yml,json}. "
      "If these files do not exist they are created");
#ifdef USE_SENTENCEPIECE
  cli.add<std::vector<float>>("--sentencepiece-alphas",
      "Sampling factors for SentencePiece vocabulary; i-th factor corresponds to i-th vocabulary");
  cli.add<std::string>("--sentencepiece-options",
      "Pass-through command-line options to SentencePiece trainer");
  cli.add<size_t>("--sentencepiece-max-lines",
      "Maximum lines to train SentencePiece vocabulary, selected with sampling from all data. "
      "When set to 0 all lines are going to be used.",
      2000000);
#endif
  // scheduling options

  // @TODO: these should be re-defined as aliases for `--after` but the current frame work matches on value, so not doable.
  cli.add<size_t>("--after-epochs,-e",
      "Finish after this many epochs, 0 is infinity (deprecated, '--after-epochs N' corresponds to '--after Ne')"); // @TODO: replace with alias
  cli.add<size_t>("--after-batches",
      "Finish after this many batch updates, 0 is infinity (deprecated, '--after-batches N' corresponds to '--after Nu')"); // @TODO: replace with alias

  cli.add<std::string>("--after,-a",
      "Finish after this many chosen training units, 0 is infinity (e.g. 100e = 100 epochs, 10Gt = 10 billion target labels, 100Ku = 100,000 updates)",
      "0e");
  cli.add<std::string/*SchedulerPeriod*/>("--disp-freq",
      "Display information every arg updates (append 't' for every arg target labels)",
      "1000u");
  cli.add<size_t>("--disp-first",
      "Display information for the first arg updates");
  cli.add<bool>("--disp-label-counts",
      "Display label counts when logging loss progress",
      true);
//   cli.add<int>("--disp-label-index",
//       "Display label counts based on i-th input stream (-1 is last)", -1);
  cli.add<std::string/*SchedulerPeriod*/>("--save-freq",
      "Save model file every arg updates (append 't' for every arg target labels)",
      "10000u");
  cli.add<std::vector<std::string>>("--logical-epoch",
      "Redefine logical epoch counter as multiple of data epochs (e.g. 1e), updates (e.g. 100Ku) or labels (e.g. 1Gt). "
      "Second parameter defines width of fractional display, 0 by default.",
      {"1e", "0"});

  addSuboptionsInputLength(cli);
  addSuboptionsTSV(cli);

  // data management options
  cli.add<std::string>("--shuffle",
      "How to shuffle input data (data: shuffles data and sorted batches; batches: "
      "data is read in order into batches, but batches are shuffled; none: no shuffling). "
      "Use with '--maxi-batch-sort none' in order to achieve exact reading order", "data");
  cli.add<bool>("--no-shuffle",
      "Shortcut for backwards compatibility, equivalent to --shuffle none (deprecated)");
  cli.add<bool>("--no-restore-corpus",
      "Skip restoring corpus state after training is restarted");
  cli.add<std::string>("--tempdir,-T",
      "Directory for temporary (shuffled) files and database",
      "/tmp");
  cli.add<std::string>("--sqlite",
      "Use disk-based sqlite3 database for training corpus storage, default"
      " is temporary with path creates persistent storage")
    ->implicit_val("temporary");
  cli.add<bool>("--sqlite-drop",
      "Drop existing tables in sqlite3 database");

  addSuboptionsDevices(cli);
  addSuboptionsBatching(cli);

  // optimizer options
  cli.add<std::string>("--optimizer,-o",
     "Optimization algorithm: sgd, adagrad, adam",
     "adam");
  cli.add<std::vector<float>>("--optimizer-params",
     "Parameters for optimization algorithm, e.g. betas for Adam. "
     "Auto-adjusted to --mini-batch-words-ref if given");
  cli.add<float>("--optimizer-delay",
     "SGD update delay (#batches between updates). 1 = no delay. "
     "Can be fractional, e.g. 0.1 to use only 10% of each batch",
     1.f);

  cli.add<bool>("--sync-sgd",
     "Use synchronous SGD instead of asynchronous for multi-gpu training");

  // learning rate options
  cli.add<float>("--learn-rate,-l",
     "Learning rate. "
      "Auto-adjusted to --mini-batch-words-ref if given",
     0.0001f);
  cli.add<bool>("--lr-report",
     "Report learning rate for each update");

  cli.add<float>("--lr-decay",
     "Per-update decay factor for learning rate: lr <- lr * arg (0 to disable)");
  cli.add<std::string>("--lr-decay-strategy",
     "Strategy for learning rate decaying: epoch, batches, stalled, epoch+batches, epoch+stalled",
     "epoch+stalled");
  cli.add<std::vector<size_t>>("--lr-decay-start",
     "The first number of (epoch, batches, stalled) validations to start learning rate decaying (tuple)",
     {10, 1});
  cli.add<size_t>("--lr-decay-freq",
     "Learning rate decaying frequency for batches, requires --lr-decay-strategy to be batches",
     50000);
  cli.add<bool>("--lr-decay-reset-optimizer",
      "Reset running statistics of optimizer whenever learning rate decays");
  cli.add<bool>("--lr-decay-repeat-warmup",
     "Repeat learning rate warmup when learning rate is decayed");
  cli.add<std::vector<std::string/*SchedulerPeriod*/>>("--lr-decay-inv-sqrt",
     "Decrease learning rate at arg / sqrt(no. batches) starting at arg (append 't' or 'e' for sqrt(target labels or epochs)). "
     "Add second argument to define the starting point (default: same as first value)",
     {"0"});

  cli.add<std::string/*SchedulerPeriod*/>("--lr-warmup",
     "Increase learning rate linearly for arg first batches (append 't' for arg first target labels)",
      "0");
  cli.add<float>("--lr-warmup-start-rate",
     "Start value for learning rate warmup");
  cli.add<bool>("--lr-warmup-cycle",
     "Apply cyclic warmup");
  cli.add<bool>("--lr-warmup-at-reload",
     "Repeat warmup after interrupted training");

  // loss shaping and regularization options
  cli.add<double>("--label-smoothing",
     "Epsilon for label smoothing (0 to disable)");
  cli.add<double>("--factor-weight",
     "Weight for loss function for factors (factored vocab only) (1 to disable)", 1.0);
  cli.add<float>("--clip-norm",
     "Clip gradient norm to arg (0 to disable)",
     1.f); // @TODO: this is currently wrong with ce-sum and should rather be disabled or fixed by multiplying with labels
  cli.add<float>("--exponential-smoothing",
     "Maintain smoothed version of parameters for validation and saving with smoothing factor. 0 to disable. "
      "Auto-adjusted to --mini-batch-words-ref if given.",
     0.f)->implicit_val("1e-4");
  cli.add<std::string>("--guided-alignment",
     "Path to a file with word alignments. Use guided alignment to guide attention or 'none'. "
     "If --tsv it specifies the index of a TSV field that contains the alignments (0-based)",
     "none");
  cli.add<std::string>("--guided-alignment-cost",
     "Cost type for guided alignment: ce (cross-entropy), mse (mean square error), mult (multiplication)",
     "ce");
  cli.add<double>("--guided-alignment-weight",
     "Weight for guided alignment cost",
     0.1);
  cli.add<std::string>("--data-weighting",
     "Path to a file with sentence or word weights. "
     "If --tsv it specifies the index of a TSV field that contains the weights (0-based)");
  cli.add<std::string>("--data-weighting-type",
     "Processing level for data weighting: sentence, word",
     "sentence");

  // embedding options
  cli.add<std::vector<std::string>>("--embedding-vectors",
     "Paths to files with custom source and target embedding vectors");
  cli.add<bool>("--embedding-normalization",
     "Normalize values from custom embedding vectors to [-1, 1]");
  cli.add<bool>("--embedding-fix-src",
     "Fix source embeddings. Affects all encoders");
  cli.add<bool>("--embedding-fix-trg",
     "Fix target embeddings. Affects all decoders");

  // mixed precision training
  cli.add<bool>("--fp16",
      "Shortcut for mixed precision training with float16 and cost-scaling, "
      "corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f");
  cli.add<std::vector<std::string>>("--precision",
      "Mixed precision training for forward/backward pass and optimization. "
      "Defines types for: forward/backward pass, optimization.",
      {"float32", "float32"});
  cli.add<std::vector<std::string>>("--cost-scaling",
      "Dynamic cost scaling for mixed precision training: "
      "scaling factor, frequency, multiplier, minimum factor")
      ->implicit_val("256.f 1000 2.f 256.f");
  cli.add<size_t>("--gradient-norm-average-window",
      "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). "
      "After this many updates about 90% of the mass of the exponential average comes from these updates",
      100);
  cli.add<std::vector<std::string>>("--dynamic-gradient-scaling",
      "Re-scale gradient to have average gradient norm if (log) gradient norm diverges from average by arg1 sigmas. "
      "If arg2 = \"log\" the statistics are recorded for the log of the gradient norm else use plain norm")
      ->implicit_val("2.f log");
  cli.add<bool>("--check-gradient-nan",
      "Skip parameter update in case of NaNs in gradient");
  cli.add<bool>("--normalize-gradient",
      "Normalize gradient by multiplying with no. devices / total labels (not recommended and to be removed in the future)");

  cli.add<std::vector<std::string>>("--train-embedder-rank",
      "Override model configuration and train an embedding similarity ranker with the model encoder, "
      "parameters encode margin and an optional normalization factor")
    ->implicit_val("0.3f 0.0f");

  // model quantization training
  addSuboptionsQuantization(cli);

  // add ULR settings
  addSuboptionsULR(cli);

  cli.add<std::vector<std::string>>("--task",
     "Use predefined set of options. Possible values: transformer-base, transformer-big, "
     "transformer-base-prenorm, transformer-big-prenorm");
  cli.switchGroup(previous_group);
  // clang-format on
}

// Registers the options of the "Validation set options" group: validation data, frequency and
// metrics, early stopping, the decoding options used when validating by translation, efficiency
// limits and the external validation script.
//
// The group returned by the initial switchGroup() call is passed back to switchGroup() at the end.
//
// @param cli wrapper on which the options are defined
void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) {
  auto previous_group = cli.switchGroup("Validation set options");

  // clang-format off
  cli.add<std::vector<std::string>>("--valid-sets",
      "Paths to validation corpora: source target");
  cli.add<std::string/*SchedulerPeriod*/>("--valid-freq",
      "Validate model every arg updates (append 't' for every arg target labels)",
      "10000u");
  cli.add<std::vector<std::string>>("--valid-metrics",
      "Metric to use during validation: cross-entropy, ce-mean-words, perplexity, valid-script, "
      "translation, bleu, bleu-detok (deprecated, same as bleu), bleu-segmented, chrf. "
      "Multiple metrics can be specified",
      {"cross-entropy"});
  cli.add<bool>("--valid-reset-stalled",
     "Reset all stalled validation metrics when the training is restarted");
  cli.add<size_t>("--early-stopping",
     "Stop if the first validation metric does not improve for arg consecutive validation steps",
     10);
  // note the ". " separator: adjacent string literals are concatenated without any white space
  cli.add<std::string>("--early-stopping-on",
      "Decide if early stopping should take into account first, all, or any validation metrics. "
      "Possible values: first, all, any",
      "first");

  // decoding options
  cli.add<size_t>("--beam-size,-b",
      "Beam size used during search with validating translator",
      12);
  cli.add<float>("--normalize,-n",
      "Divide translation score by pow(translation length, arg)",
      0)->implicit_val("1");
  cli.add<float>("--max-length-factor",
      "Maximum target length as source length times factor",
      3);
  cli.add<float>("--word-penalty",
      "Subtract (arg * translation length) from translation score");
  cli.add<bool>("--allow-unk",
      "Allow unknown words to appear in output");
  cli.add<bool>("--n-best",
      "Generate n-best list");
  cli.add<bool>("--word-scores",
      "Print word-level scores. One score per subword unit, not normalized even if --normalize");

  // efficiency options
  cli.add<int>("--valid-mini-batch",
      "Size of mini-batch used during validation",
      32);
  cli.add<size_t>("--valid-max-length",
      "Maximum length of a sentence in a validating sentence pair. "
      "Sentences longer than valid-max-length are cropped to valid-max-length",
      1000);

  // options for validation script
  cli.add<std::string>("--valid-script-path",
     "Path to external validation script."
     " It should print a single score to stdout."
     " If the option is used with validating translation, the output"
     " translation file will be passed as a first argument");
  cli.add<std::vector<std::string>>("--valid-script-args",
      "Additional args passed to --valid-script-path. These are inserted"
      " between the script path and the output translation-file path");
  cli.add<std::string>("--valid-translation-output",
     "(Template for) path to store the translation. "
     "E.g., validation-output-after-{U}-updates-{T}-tokens.txt. Template "
     "parameters: {E} for epoch; {B} for No. of batches within epoch; "
     "{U} for total No. of updates; {T} for total No. of tokens seen.");
  cli.add<bool>("--keep-best",
      "Keep best model for each validation metric");
  cli.add<std::string>("--valid-log",
     "Log validation scores to file given by arg");
  cli.switchGroup(previous_group);
  // clang-format on
}

// Registers the options of the "Translator options" group: input/output, decoding options,
// mixed-precision inference, shortlists, scorer weights, output sampling and on-line
// quantization.
//
// Shared sub-groups are added through the addSuboptions*() helpers (input length, TSV, devices,
// batching). --no-spm-decode is only defined when USE_SENTENCEPIECE is set at compile time. The
// group returned by the initial switchGroup() call is passed back to switchGroup() at the end.
//
// @param cli wrapper on which the options are defined
void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
  auto previous_group = cli.switchGroup("Translator options");

  // clang-format off
  cli.add<std::vector<std::string>>("--input,-i",
      "Paths to input file(s), stdin by default",
      {"stdin"});
  cli.add<std::string>("--output,-o",
      "Path to output file, stdout by default",
      "stdout");
  cli.add<std::vector<std::string>>("--vocabs,-v",
      "Paths to vocabulary files have to correspond to --input");
  // decoding options
  cli.add<size_t>("--beam-size,-b",
      "Beam size used during search with validating translator",
      12);
  cli.add<float>("--normalize,-n",
      "Divide translation score by pow(translation length, arg)",
      0)->implicit_val("1");
  cli.add<float>("--max-length-factor",
      "Maximum target length as source length times factor",
      3);
  cli.add<float>("--word-penalty",
      "Subtract (arg * translation length) from translation score");
  cli.add<bool>("--allow-unk",
      "Allow unknown words to appear in output");
  cli.add<bool>("--allow-special",
      "Allow special symbols to appear in output, e.g. for SentencePiece with byte-fallback do not suppress the newline symbol");
  cli.add<bool>("--n-best",
      "Generate n-best list");
  cli.add<std::string>("--alignment",
     "Return word alignment. Possible values: 0.0-1.0, hard, soft")
    ->implicit_val("1");
  cli.add<bool>("--word-scores",
      "Print word-level scores. One score per subword unit, not normalized even if --normalize");
  cli.add<std::string/*SchedulerPeriod*/>("--stat-freq",
    "Display speed information every arg mini-batches. Disabled by default with 0, set to value larger than 0 to activate",
    "0");
#ifdef USE_SENTENCEPIECE
  cli.add<bool>("--no-spm-decode",
      "Keep the output segmented into SentencePiece subwords");
#endif

  addSuboptionsInputLength(cli);
  addSuboptionsTSV(cli);
  addSuboptionsDevices(cli);
  addSuboptionsBatching(cli);

  // mixed precision inference
  cli.add<bool>("--fp16",
      "Shortcut for mixed precision inference with float16, corresponds to: --precision float16");
  cli.add<std::vector<std::string>>("--precision",
      "Mixed precision for inference, set parameter type in expression graph",
      {"float32"});
  cli.add<bool>("--skip-cost",
    "Ignore model cost during translation, not recommended for beam-size > 1");

  cli.add<std::vector<std::string>>("--shortlist",
     "Use softmax shortlist: path first best prune");
  cli.add<std::vector<float>>("--weights",
      "Scorer weights");
  // the first literal already ends in a blank, so the second must not start with one
  cli.add<std::vector<std::string>>("--output-sampling",
     "Noise output layer with gumbel noise. Implicit default is 'full' for sampling from full distribution. "
     "Also accepts 'topk num' (e.g. topk 100) for top-100 sampling.")
     ->implicit_val("full");
  cli.add<std::vector<int>>("--output-approx-knn",
     "Use approximate knn search in output layer (currently only in transformer)")
     ->implicit_val("100 1024");

  // parameters for on-line quantization
  cli.add<bool>("--optimize",
      "Optimize the graph on-the-fly", false);
  cli.add<std::string>("--gemm-type,-g",
     "GEMM Type to be used for on-line quantization/packing: float32, packed16, packed8", "float32");
  cli.add<float>("--quantize-range",
     "Range for the on-line quantization of weight matrix in multiple of this range and standard deviation, 0.0 means min/max quantization",
     0.f);

#if 0 // @TODO: Ask Hany if there are any decoding-time options
  // add ULR settings
  addSuboptionsULR(cli);
#endif

  cli.switchGroup(previous_group);
  // clang-format on
}

/**
 * Registers the scorer's command-line options.
 *
 * The options are added after switching the CLI wrapper to the "Scorer options" group; the group
 * that was active on entry is restored before returning. In addition to the scorer-specific
 * options, the shared input-length, TSV, device and batching sub-options are registered.
 *
 * @param cli wrapper on which the options are registered
 */
void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
  auto previous_group = cli.switchGroup("Scorer options");

  // clang-format off
  cli.add<bool>("--no-reload",
      "Do not load existing model specified in --model arg");
  // TODO: move options like vocabs and train-sets to a separate procedure as they are defined twice
  cli.add<std::vector<std::string>>("--train-sets,-t",
      "Paths to corpora to be scored: source target");
  cli.add<std::string>("--output,-o",
      "Path to output file, stdout by default",
      "stdout");
  cli.add<std::vector<std::string>>("--vocabs,-v",
      "Paths to vocabulary files have to correspond to --train-sets. "
      "If this parameter is not supplied we look for vocabulary files source.{yml,json} and target.{yml,json}. "
      "If these files do not exist they are created");
  cli.add<bool>("--n-best",
      "Score n-best list instead of plain text corpus");
  cli.add<std::string>("--n-best-feature",
      "Feature name to be inserted into n-best list", "Score");
  cli.add<bool>("--normalize,-n",
      "Divide translation score by translation length");
  cli.add<std::string>("--summary",
      "Only print total cost, possible values: cross-entropy (ce-mean), ce-mean-words, ce-sum, perplexity")
      ->implicit_val("cross-entropy");
  // Each registration is its own statement; this one used to end in ',' and was chained to the
  // next call through the comma operator.
  cli.add<std::string>("--alignment",
      "Return word alignments. Possible values: 0.0-1.0, hard, soft")
      ->implicit_val("1");
  cli.add<bool>("--word-scores",
      "Print word-level scores. One score per subword unit, not normalized even if --normalize");

  addSuboptionsInputLength(cli);
  addSuboptionsTSV(cli);
  addSuboptionsDevices(cli);
  addSuboptionsBatching(cli);

  cli.add<bool>("--fp16",
      "Shortcut for mixed precision inference with float16, corresponds to: --precision float16");
  cli.add<std::vector<std::string>>("--precision",
      "Mixed precision for inference, set parameter type in expression graph",
      {"float32"});

  // parameters for on-line quantization
  cli.add<bool>("--optimize",
      "Optimize the graph on-the-fly", false);
  cli.add<std::string>("--gemm-type,-g",
      "GEMM Type to be used for on-line quantization/packing: float32, packed16, packed8", "float32");
  cli.add<float>("--quantize-range",
      "Range for the on-line quantization of weight matrix in multiple of this range and standard deviation, 0.0 means min/max quantization",
      0.f);

  cli.switchGroup(previous_group);
  // clang-format on
}

/**
 * Registers the command-line options of the embedding tool.
 *
 * The group that was active on entry is restored before returning. Besides the tool-specific
 * options, the shared input-length, TSV, device and batching sub-options are registered.
 *
 * @param cli wrapper on which the options are registered
 */
void ConfigParser::addOptionsEmbedding(cli::CLIWrapper& cli) {
  // NOTE(review): the group name and the "to be scored" wording below are the same as in
  // addOptionsScoring; they look copied from there - confirm whether a dedicated group name is
  // wanted before changing them.
  auto previous_group = cli.switchGroup("Scorer options");

  // clang-format off
  cli.add<bool>("--no-reload",
      "Do not load existing model specified in --model arg");
  // TODO: move options like vocabs and train-sets to a separate procedure as they are defined twice
  cli.add<std::vector<std::string>>("--train-sets,-t",
      "Paths to corpora to be scored: source target");
  cli.add<std::string>("--output,-o",
      "Path to output file, stdout by default",
      "stdout");
  cli.add<std::vector<std::string>>("--vocabs,-v",
      "Paths to vocabulary files have to correspond to --train-sets. "
      "If this parameter is not supplied we look for vocabulary files source.{yml,json} and target.{yml,json}. "
      "If these files do not exist they are created");

  cli.add<bool>("--compute-similarity",
      "Expect two inputs and compute cosine similarity instead of outputting embedding vector");
  cli.add<bool>("--binary",
      "Output vectors as binary floats");

  addSuboptionsInputLength(cli);
  addSuboptionsTSV(cli);
  addSuboptionsDevices(cli);
  addSuboptionsBatching(cli);

  cli.add<bool>("--fp16",
      "Shortcut for mixed precision inference with float16, corresponds to: --precision float16");
  cli.add<std::vector<std::string>>("--precision",
      "Mixed precision for inference, set parameter type in expression graph. Supported values: float32, float16",
      {"float32"});

  cli.switchGroup(previous_group);
  // clang-format on
}

/**
 * Registers the device-related options: --devices, --num-devices and --cpu-threads.
 *
 * When compiled with USE_NCCL and the parser is in training mode, the NCCL-related options
 * (--no-nccl, --sharding, --sync-freq) are registered as well. The default of --cpu-threads
 * depends on whether CUDA_FOUND is defined at compile time.
 *
 * @param cli wrapper on which the options are registered
 */
void ConfigParser::addSuboptionsDevices(cli::CLIWrapper& cli) {
  // Help text shared by both compile-time variants of --cpu-threads below.
  const std::string cpuThreadsHelp
      = "Use CPU-based computation with this many independent threads, 0 means GPU-based computation";

  // clang-format off
  cli.add<std::vector<std::string>>(
      "--devices,-d",
      "Specifies GPU ID(s) to use for training. Defaults to 0..num-devices-1",
      {"0"});
  cli.add<size_t>(
      "--num-devices",
      "Number of GPUs to use for this process. Defaults to length(devices) or 1");

#ifdef USE_NCCL
  if(mode_ == cli::mode::training) {
    cli.add<bool>(
        "--no-nccl",
        "Disable inter-GPU communication via NCCL");
    cli.add<std::string>(
        "--sharding",
        "When using NCCL and MPI for multi-process training use 'global' (default, less memory usage) "
        "or 'local' (more memory usage but faster) sharding",
        {"global"});
    cli.add<std::string/*SchedulerPeriod*/>(
        "--sync-freq",
        "When sharding is local sync all shards across processes once every n steps (possible units u=updates, t=target labels, e=epochs)",
        "200u");
  }
#endif

#ifdef CUDA_FOUND
  // With CUDA available the default is 0; a bare --cpu-threads gets the implicit value "1".
  cli.add<size_t>("--cpu-threads", cpuThreadsHelp, 0)->implicit_val("1");
#else
  // Without CUDA the default is 1.
  cli.add<size_t>("--cpu-threads", cpuThreadsHelp, 1);
#endif
  // clang-format on
}

/**
 * Registers the batching-related options.
 *
 * Defaults and help texts depend on the parser mode: translation uses a mini-batch and maxi-batch
 * of 1 without sorting, every other mode uses 64 / 100 with sorting by "trg". A number of
 * additional options are only registered in training mode. The default of --data-threads is
 * chosen at compile time via the DETERMINISTIC macro.
 *
 * @param cli wrapper on which the options are registered
 */
void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
  const bool translating = (mode_ == cli::mode::translation);
  const bool training    = (mode_ == cli::mode::training);

  // set accurate help messages for translation, scoring, or training
  std::string miniBatchHelp;
  if(translating)
    miniBatchHelp = "Size of mini-batch used during batched translation";
  else if(mode_ == cli::mode::scoring)
    miniBatchHelp = "Size of mini-batch used during batched scoring";
  else
    miniBatchHelp = "Size of mini-batch used during update";

  const int miniBatchDefault = translating ? 1 : 64;
  const int maxiBatchDefault = translating ? 1 : 100;
  const std::string maxiBatchSortDefault = translating ? "none" : "trg";

#if DETERMINISTIC
  const size_t dataThreadsDefault = 1;
#else
  const size_t dataThreadsDefault = 8;
#endif

  // clang-format off
  cli.add<int>("--mini-batch", miniBatchHelp, miniBatchDefault);
  cli.add<int>("--mini-batch-words",
      "Set mini-batch size based on words instead of sentences");

  if(training) {
    cli.add<bool>("--mini-batch-fit",
        "Determine mini-batch size automatically based on sentence-length to fit reserved memory");
    cli.add<size_t>("--mini-batch-fit-step",
        "Step size for mini-batch-fit statistics",
        10);
    cli.add<bool>("--gradient-checkpointing",
        "Enable gradient-checkpointing to minimize memory usage");
  }

  cli.add<int>("--maxi-batch",
      "Number of batches to preload for length-based sorting",
      maxiBatchDefault);
  cli.add<std::string>("--maxi-batch-sort",
      "Sorting strategy for maxi-batch: none, src, trg (not available for decoder)",
      maxiBatchSortDefault);

  if(training) {
    cli.add<bool>("--shuffle-in-ram",
        "Keep shuffled corpus in RAM, do not write to temp file");
  }

  // Registered in every mode; placed here so that the registration order stays the same in
  // training mode (after --shuffle-in-ram, before --all-caps-every).
  cli.add<size_t>("--data-threads",
      "Number of concurrent threads to use during data reading and processing",
      dataThreadsDefault);

  if(training) {
    // @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
    cli.add<size_t>("--all-caps-every",
        "When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
    cli.add<size_t>("--english-title-case-every",
        "When forming minibatches, preprocess every Nth line on the fly to title-case. Assumes English (ASCII only)");

    cli.add<size_t>("--mini-batch-words-ref",
        "If given, the following hyper parameters are adjusted as-if we had this mini-batch size: "
        "--learn-rate, --optimizer-params, --exponential-smoothing, --mini-batch-warmup");
    cli.add<std::string/*SchedulerPeriod*/>("--mini-batch-warmup",
        "Linear ramp-up of MB size, up to this #updates (append 't' for up to this #target labels). "
        "Auto-adjusted to --mini-batch-words-ref if given",
        {"0"});
    cli.add<bool>("--mini-batch-track-lr",
        "Dynamically track mini-batch size inverse to actual learning rate (not considering lr-warmup)");
    cli.add<bool>("--mini-batch-round-up",
        "Round up batch size to next power of 2 for more efficient training, but this can make batch size less stable. Disable with --mini-batch-round-up=false",
        true);
  }
  // clang-format on
}

/**
 * Registers --max-length and --max-length-crop.
 *
 * The default maximum length is 50 in training mode and 1000 in every other mode.
 *
 * @param cli wrapper on which the options are registered
 */
void ConfigParser::addSuboptionsInputLength(cli::CLIWrapper& cli) {
  size_t maxLengthDefault = 1000;
  if(mode_ == cli::mode::training)
    maxLengthDefault = 50;

  // clang-format off
  cli.add<size_t>(
      "--max-length",
      "Maximum length of a sentence in a training sentence pair",
      maxLengthDefault);
  cli.add<bool>(
      "--max-length-crop",
      "Crop a sentence to max-length instead of omitting it if longer than max-length");
  // clang-format on
}

/**
 * Registers the options describing tab-separated input: --tsv and --tsv-fields.
 *
 * @param cli wrapper on which the options are registered
 */
void ConfigParser::addSuboptionsTSV(cli::CLIWrapper& cli) {
  const std::string tsvHelp = "Tab-separated input";
  const std::string tsvFieldsHelp
      = "Number of fields in the TSV input. By default, it is guessed based on the model type";

  cli.add<bool>("--tsv", tsvHelp);
  cli.add<size_t>("--tsv-fields", tsvFieldsHelp);
}

/**
 * Registers the options for ULR (Universal Language Representation),
 * see https://arxiv.org/pdf/1802.05368.pdf
 *
 * @param cli wrapper on which the options are registered
 */
void ConfigParser::addSuboptionsULR(cli::CLIWrapper& cli) {
  // clang-format off
  // support for universal encoder ULR https://arxiv.org/pdf/1802.05368.pdf
  cli.add<bool>("--ulr",
      "Enable ULR (Universal Language Representation)");
  // reading pre-trained universal embeddings for multi-sources.
  // Note that source and target here is relative to ULR not the translation langs
  // queries: EQ in Fig2 : is the unified embeddings projected to one space.
  cli.add<std::string>("--ulr-query-vectors",
      "Path to file with universal sources embeddings from projection into universal space",
      "");
  // keys: EK in Fig2 : is the keys of the target embeddings projected to unified space (i.e. ENU in
  // multi-lingual case)
  cli.add<std::string>("--ulr-keys-vectors",
      "Path to file with universal sources embeddings of target keys from projection into universal space",
      "");
  cli.add<bool>("--ulr-trainable-transformation",
      "Make Query Transformation Matrix A trainable");
  cli.add<int>("--ulr-dim-emb",
      "ULR monolingual embeddings dimension");
  cli.add<float>("--ulr-dropout",
      "ULR dropout on embeddings attentions. Default is no dropout",
      0.0f);
  cli.add<float>("--ulr-softmax-temperature",
      "ULR softmax temperature to control randomness of predictions. Default is 1.0: no temperature",
      1.0f);
  // clang-format on
}

/**
 * Registers the options for model quantization during training: --quantize-bits,
 * --quantize-optimization-steps, --quantize-log-based and --quantize-biases.
 *
 * @param cli wrapper on which the options are registered
 */
void ConfigParser::addSuboptionsQuantization(cli::CLIWrapper& cli) {
  // clang-format off
  // model quantization training
  cli.add<size_t>(
      "--quantize-bits",
      "Number of bits to compress model to. Set to 0 to disable",
      0);
  cli.add<size_t>(
      "--quantize-optimization-steps",
      "Adjust quantization scaling factor for N steps",
      0);
  cli.add<bool>(
      "--quantize-log-based",
      "Uses log-based quantization");
  cli.add<bool>(
      "--quantize-biases",
      "Apply quantization to biases");
  // clang-format on
}

/** Returns the mode stored in this parser. */
cli::mode ConfigParser::getMode() const {
  return mode_;
}

/**
 * Parses the command line, merges in config files and returns the resulting options.
 *
 * Steps, in order:
 *  1. store the escaped command line in cmdLine_ and parse argv into the wrapped YAML config;
 *  2. handle the informational options --authors, --cite and --build-info, each of which prints
 *     to stderr and terminates the process with exit(0);
 *  3. load the config files returned by findConfigPaths() and merge them with config-file
 *     priority;
 *  4. interpolate environment variables in path options if --interpolate-env-vars is set;
 *  5. apply the STDIN shortcuts for --train-sets / --tsv (training and scoring modes only);
 *  6. validate the options if requested;
 *  7. handle --dump-config, which prints the config to stdout and terminates with exit(0);
 *  8. derive guided-alignment / data-weighting from --input-types;
 *  9. expand aliases and build the Options object.
 *
 * @param argc number of command-line arguments
 * @param argv command-line arguments
 * @param doValidate if true, ConfigValidator is run on the collected options
 * @return newly created options object filled from the final configuration
 */
Ptr<Options> ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
  cmdLine_ = escapeCmdLine(argc,argv);

  // parse command-line options and fill wrapped YAML config
  cli_.parse(argc, argv);

  // informational options: print the requested text to stderr and exit successfully
  if(get<bool>("authors")) {
    std::cerr << authors() << std::endl;
    exit(0);
  }

  if(get<bool>("cite")) {
    std::cerr << citation() << std::endl;
    exit(0);
  }

  // --build-info is a string option; an empty value or "false" means it was not requested
  auto buildInfo = get<std::string>("build-info");
  if(!buildInfo.empty() && buildInfo != "false") {
#ifdef BUILD_INFO_AVAILABLE // cmake build options are not available on MSVC based build.
    if(buildInfo == "all")
      std::cerr << cmakeBuildOptionsAdvanced() << std::endl;
    else
      std::cerr << cmakeBuildOptions() << std::endl;
    exit(0);
#else // BUILD_INFO_AVAILABLE
    ABORT("build-info is not available on MSVC based build unless compiled via CMake.");
#endif // BUILD_INFO_AVAILABLE
  }

  // get paths to extra config files
  auto configPaths = findConfigPaths();
  if(!configPaths.empty()) {
    auto config = loadConfigFiles(configPaths);
    cli_.updateConfig(config,
                     cli::OptionPriority::ConfigFile,
                     "There are option(s) in a config file that are not expected");
  }

  // expand environment variables in all options listed in PATHS
  if(get<bool>("interpolate-env-vars")) {
    cli::processPaths(config_, cli::interpolateEnvVars, PATHS);
  }

  // Option shortcuts for input from STDIN for trainer and scorer
  if(mode_ == cli::mode::training || mode_ == cli::mode::scoring) {
    auto trainSets = get<std::vector<std::string>>("train-sets");
    YAML::Node config;
    // Assume the input will come from STDIN if --tsv is set but no --train-sets are given
    if(get<bool>("tsv") && trainSets.empty()) {
      config["train-sets"].push_back("stdin");
    // Assume the input is in TSV format if --train-sets is set to "stdin"
    } else if(trainSets.size() == 1 && (trainSets[0] == "stdin" || trainSets[0] == "-")) {
      config["tsv"] = true;
    }
    // only touch the configuration if one of the shortcuts above actually applied
    if(!config.IsNull())
      cli_.updateConfig(config, cli::OptionPriority::CommandLine, "A shortcut for STDIN failed.");
  }

  if(doValidate) {
    ConfigValidator(config_).validateOptions(mode_);
  }

  // remove extra config files from the config to avoid redundancy
  config_.remove("config");

  // dump config and exit
  if(!get<std::string>("dump-config").empty() && get<std::string>("dump-config") != "false") {
    auto dumpMode = get<std::string>("dump-config");
    // the dump itself should not contain the dump-config option
    config_.remove("dump-config");

    if(dumpMode == "expand") {
      cli_.parseAliases();
    }

    bool minimal = (dumpMode == "minimal" || dumpMode == "expand");
    std::cout << cli_.dumpConfig(minimal) << std::endl;
    exit(0);
  }

  // For TSV input, it is possible to use --input-types to determine fields that contain alignments
  // or weights. In such case, the position of 'alignment' input type in --input-types determines
  // the index of a TSV field that contains word alignments, and respectively, the position of
  // 'weight' in --input-types determines the index of a TSV field that contains weights.
  // Marian will abort if both the --guided-alignment and 'alignment' in --input-types are specified
  // (or --data-weighting and 'weight').
  //
  // Note: this may modify the config, so it is safer to do it after --dump-config.
  if(mode_ == cli::mode::training || get<bool>("tsv")) {
    auto inputTypes = get<std::vector<std::string>>("input-types");
    if(!inputTypes.empty()) {
      bool seenAligns = false;
      bool seenWeight = false;
      YAML::Node config;
      for(size_t i = 0; i < inputTypes.size(); ++i) {
        if(inputTypes[i] == "alignment") {
          ABORT_IF(seenAligns, "You can specify 'alignment' only once in input-types");
          ABORT_IF(has("guided-alignment") && get<std::string>("guided-alignment") != "none",
                   "You must use either guided-alignment or 'alignment' in input-types");
          // the field index is stored as a string in guided-alignment
          config["guided-alignment"] = std::to_string(i);
          seenAligns = true;
        }
        if(inputTypes[i] == "weight") {
          ABORT_IF(seenWeight, "You can specify 'weight' only once in input-types");
          ABORT_IF(has("data-weighting") && !get<std::string>("data-weighting").empty(),
                   "You must use either data-weighting or 'weight' in input-types");
          // the field index is stored as a string in data-weighting
          config["data-weighting"] = std::to_string(i);
          seenWeight = true;
        }
      }
      if(!config.IsNull())
        cli_.updateConfig(config,
                          cli::OptionPriority::CommandLine,
                          "Extracting 'alignment' and 'weight' types from input-types failed.");
    }
  }

  // The section below is compiled out; it is kept for reference only.
#if 0 // @TODO: remove once fully deprecated
  // Convert --after-batches N to --after Nu and --after-epochs N to --after Ne, different values get concatenated with ","
  if(mode_ == cli::mode::training && get<size_t>("after-epochs") > 0) {
    auto afterValue = get<size_t>("after-epochs");
    LOG(info, "\"--after-epochs {}\" is deprecated, please use \"--after {}e\" instead (\"e\" stands for epoch)", afterValue, afterValue);
    YAML::Node config;
    std::string prevAfter = get<std::string>("after");
    std::string converted = std::to_string(afterValue) + "e";
    if(prevAfter != "0e")
      config["after"] = prevAfter + "," + converted;
    else
      config["after"] = converted;
    if(!config.IsNull())
      cli_.updateConfig(config,
                        cli::OptionPriority::CommandLine,
                        "Could not update --after with value from --after-epochs");
  }
  if(mode_ == cli::mode::training && get<size_t>("after-batches") > 0) {
    auto afterValue = get<size_t>("after-batches");
    LOG(info, "\"--after-batches {}\" is deprecated, please use \"--after {}u\" instead (\"u\" stands for updates)", afterValue, afterValue);
    YAML::Node config;
    std::string prevAfter = get<std::string>("after");
    std::string converted = std::to_string(afterValue) + "u";
    if(prevAfter != "0e")
      config["after"] = prevAfter + "," + converted;
    else
      config["after"] = converted;
    if(!config.IsNull())
      cli_.updateConfig(config,
                        cli::OptionPriority::CommandLine,
                        "Could not update --after with value from --after-updates");
  }
#endif

  // expand aliases, then build the Options object from the final configuration
  cli_.parseAliases();
  auto opts = New<Options>();
  opts->merge(Config(*this).get());
  return opts;
}

std::vector<std::string> ConfigParser::findConfigPaths() {
  std::vector<std::string> paths;

  bool interpolateEnvVars = get<bool>("interpolate-env-vars");
  bool loadConfig = !config_["config"].as<std::vector<std::string>>().empty();

  if(loadConfig) {
    paths = config_["config"].as<std::vector<std::string>>();
    for(auto& path : paths) {
      // (note: this updates the paths array)
      if(interpolateEnvVars)
        path = cli::interpolateEnvVars(path);
    }
  } else if(mode_ == cli::mode::training) {
    auto path = config_["model"].as<std::string>() + ".yml";
    if(interpolateEnvVars)
      path = cli::interpolateEnvVars(path);

    bool reloadConfig = filesystem::exists(path) && !get<bool>("no-reload");
    if(reloadConfig)
      paths = {path};
  }

  return paths;
}

/**
 * Loads the given YAML config files and merges their top-level entries into a single node.
 *
 * Files are processed in the given order; an entry from a later file replaces an entry with the
 * same key from an earlier one. If a file sets "relative-paths" to true, the options listed in
 * PATHS are made absolute with respect to that file (after interpolating environment variables if
 * requested by the file or by the command line), and the "relative-paths" entry is dropped from it.
 *
 * @param paths config files to load
 * @return node containing the merged top-level entries of all files
 */
YAML::Node ConfigParser::loadConfigFiles(const std::vector<std::string>& paths) {
  YAML::Node merged;

  for(const auto& file : paths) {
    // load single config file
    io::InputFileStream input(file);
    YAML::Node current = YAML::Load(input);

    // expand relative paths if requested
    const bool relativePaths = current["relative-paths"] && current["relative-paths"].as<bool>();
    if(relativePaths) {
      // interpolate environment variables if requested in this config file or
      // via command-line options
      bool expandEnv = current["interpolate-env-vars"] && current["interpolate-env-vars"].as<bool>();
      if(!expandEnv)
        expandEnv = get<bool>("interpolate-env-vars");
      if(expandEnv)
        cli::processPaths(current, cli::interpolateEnvVars, PATHS);

      // replace relative path w.r.t. the config file
      cli::makeAbsolutePaths(current, file, PATHS);
      // remove 'relative-paths' and do not spread it into other config files
      current.remove("relative-paths");
    }

    // merge with previous config files, later file overrides earlier
    for(const auto& entry : current)
      merged[entry.first.as<std::string>()] = YAML::Clone(entry.second);
  }

  return merged;
}

/** Returns a read-only reference to the YAML configuration held by this parser. */
const YAML::Node& ConfigParser::getConfig() const { return config_; }
}  // namespace marian