.. _program_listing_file_src_common_config_parser.cpp: Program Listing for File config_parser.cpp ========================================== |exhale_lsh| :ref:`Return to documentation for file ` (``src/common/config_parser.cpp``) .. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS .. code-block:: cpp #include "common/authors.h" #include "common/build_info.h" #include "common/cli_helper.h" #include "common/config.h" #include "common/config_parser.h" #include "common/config_validator.h" #include "common/definitions.h" #include "common/file_stream.h" #include "common/logging.h" #include "common/options.h" #include "common/regex.h" #include "common/utils.h" #include #include #include #include #if MKL_FOUND #include #else #if BLAS_FOUND #include #endif #endif namespace marian { // TODO: Move this to CLIWrapper and allow to mark options as paths in the same place they are // defined // clang-format off const std::set PATHS = { "model", "models", "train-sets", "vocabs", "embedding-vectors", "valid-sets", "valid-script-path", "valid-script-args", "valid-log", "valid-translation-output", "input", // except: 'stdin', handled in makeAbsolutePaths and interpolateEnvVars "output", // except: 'stdout', handled in makeAbsolutePaths and interpolateEnvVars "pretrained-model", "data-weighting", "log", "sqlite", // except: 'temporary', handled in the processPaths function "shortlist", // except: only the first element in the sequence is a path, handled in the // processPaths function }; // clang-format on std::string escapeCmdLine(int argc, char** argv){ std::string cmdLine; for(int i = 0; i < argc; i++) { std::string arg = argv[i]; std::string quote; // attempt to quote special chars if(arg.empty() || arg.find_first_of(" #`\"'\\${}|&^?*!()%><") != std::string::npos) quote = "'"; arg = regex::regex_replace(arg, regex::regex("'"), "'\\''"); if(!cmdLine.empty()) cmdLine.push_back(' '); cmdLine += quote + arg + quote; } return cmdLine; } std::string const& ConfigParser::cmdLine() const { return cmdLine_; } ConfigParser::ConfigParser(cli::mode mode) : cli_(config_,"Marian: Fast Neural Machine Translation in C++", "General options", "", 40), mode_(mode == cli::mode::server ? cli::mode::translation : mode) { addOptionsGeneral(cli_); if (mode == cli::mode::server) addOptionsServer(cli_); addOptionsModel(cli_); // clang-format off switch(mode_) { case cli::mode::training: addOptionsTraining(cli_); addOptionsValidation(cli_); break; case cli::mode::translation: addOptionsTranslation(cli_); break; case cli::mode::scoring: addOptionsScoring(cli_); break; case cli::mode::embedding: addOptionsEmbedding(cli_); break; default: ABORT("wrong CLI mode"); break; } addAliases(cli_); // clang-format on } void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) { int defaultWorkspace = (mode_ == cli::mode::translation) ? 512 : 2048; cli.switchGroup("General options"); // clang-format off cli.add("--authors", "Print list of authors and exit"); cli.add("--cite", "Print citation and exit"); cli.add("--build-info", "Print CMake build options and exit. Set to 'all' to print advanced options") ->implicit_val("basic"); cli.add>("--config,-c", "Configuration file(s). If multiple, later overrides earlier"); cli.add("--workspace,-w", "Preallocate arg MB of work space", defaultWorkspace); cli.add("--log", "Log training process information to file given by arg"); cli.add("--log-level", "Set verbosity level of logging: trace, debug, info, warn, err(or), critical, off", "info"); cli.add("--log-time-zone", "Set time zone for the date shown on logging"); cli.add("--quiet", "Suppress all logging to stderr. Logging to files still works"); cli.add("--quiet-translation", "Suppress logging for translation"); cli.add("--seed", "Seed for all random number generators. 0 means initialize randomly"); cli.add("--check-nan", "Check for NaNs or Infs in forward and backward pass. Will abort when found. " "This is a diagnostic option that will slow down computation significantly"); cli.add("--interpolate-env-vars", "allow the use of environment variables in paths, of the form ${VAR_NAME}"); cli.add("--relative-paths", "All paths are relative to the config file location"); cli.add("--dump-config", "Dump current (modified) configuration to stdout and exit. Possible values: full, minimal, expand") ->implicit_val("full"); if(mode_ == cli::mode::training) { // --sigterm is deliberately not a boolean, to allow for a consistent // pattern of specifying custom signal handling in the future. // (e.g., dump model but continue training upon SIGUSR1, or report current // training status upon SIGINFO.) cli.add("--sigterm", "What to do with SIGTERM: save-and-exit or exit-immediately.", "save-and-exit"); } // clang-format on } void ConfigParser::addOptionsServer(cli::CLIWrapper& cli) { // clang-format off auto previous_group = cli.switchGroup("Server options"); cli.add("--port,-p", "Port number for web socket server", 8080); cli.switchGroup(previous_group); // clang-format on } void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { auto previous_group = cli.switchGroup("Model options"); // clang-format off if(mode_ == cli::mode::translation) { cli.add>("--models,-m", "Paths to model(s) to be loaded. Supported file extensions: .npz, .bin"); } else { cli.add("--model,-m", "Path prefix for model to be saved/resumed. Supported file extensions: .npz, .bin", "model.npz"); if(mode_ == cli::mode::training) { cli.add("--pretrained-model", "Path prefix for pre-trained model to initialize model weights"); } } #ifdef COMPILE_CPU if(mode_ == cli::mode::translation) { cli.add("--model-mmap", "Use memory-mapping when loading model (CPU only)"); } #endif cli.add("--ignore-model-config", "Ignore the model configuration saved in npz file"); cli.add("--type", "Model type: amun, nematus, s2s, multi-s2s, transformer", "amun"); cli.add>("--dim-vocabs", "Maximum items in vocabulary ordered by rank, 0 uses all items in the provided/created vocabulary file", {0, 0}); cli.add("--dim-emb", "Size of embedding vector", 512); cli.add("--factors-dim-emb", "Embedding dimension of the factors. Only used if concat is selected as factors combining form"); cli.add("--factors-combine", "How to combine the factors and lemma embeddings. Options available: sum, concat", "sum"); cli.add("--lemma-dependency", "Lemma dependency method to use when predicting target factors. Options: soft-transformer-layer, hard-transformer-layer, lemma-dependent-bias, re-embedding"); cli.add("--lemma-dim-emb", "Re-embedding dimension of lemma in factors", 0); cli.add("--dim-rnn", "Size of rnn hidden state", 1024); cli.add("--enc-type", "Type of encoder RNN : bidirectional, bi-unidirectional, alternating (s2s)", "bidirectional"); cli.add("--enc-cell", "Type of RNN cell: gru, lstm, tanh (s2s)", "gru"); cli.add("--enc-cell-depth", "Number of transitional cells in encoder layers (s2s)", 1); cli.add("--enc-depth", "Number of encoder layers (s2s)", 1); cli.add("--dec-cell", "Type of RNN cell: gru, lstm, tanh (s2s)", "gru"); cli.add("--dec-cell-base-depth", "Number of transitional cells in first decoder layer (s2s)", 2); cli.add("--dec-cell-high-depth", "Number of transitional cells in next decoder layers (s2s)", 1); cli.add("--dec-depth", "Number of decoder layers (s2s)", 1); cli.add("--skip", "Use skip connections (s2s)"); cli.add("--layer-normalization", "Enable layer normalization"); cli.add("--right-left", "Train right-to-left model"); cli.add>("--input-types", "Provide type of input data if different than 'sequence'. " "Possible values: sequence, class, alignment, weight. " "You need to provide one type per input file (if --train-sets) or per TSV field (if --tsv).", {}); cli.add("--best-deep", "Use Edinburgh deep RNN configuration (s2s)"); cli.add("--tied-embeddings", "Tie target embeddings and output embeddings in output layer"); cli.add("--tied-embeddings-src", "Tie source and target embeddings"); cli.add("--tied-embeddings-all", "Tie all embedding layers and output layer"); cli.add("--output-omit-bias", "Do not use a bias vector in decoder output layer"); // Transformer options cli.add("--transformer-heads", "Number of heads in multi-head attention (transformer)", 8); cli.add("--transformer-no-projection", "Omit linear projection after multi-head attention (transformer)"); cli.add("--transformer-pool", "Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)"); cli.add("--transformer-dim-ffn", "Size of position-wise feed-forward network (transformer)", 2048); cli.add("--transformer-decoder-dim-ffn", "Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.", 0); cli.add("--transformer-ffn-depth", "Depth of filters (transformer)", 2); cli.add("--transformer-decoder-ffn-depth", "Depth of filters in decoder (transformer). Uses --transformer-ffn-depth if 0", 0); cli.add("--transformer-ffn-activation", "Activation between filters: swish or relu (transformer)", "swish"); cli.add("--transformer-dim-aan", "Size of position-wise feed-forward network in AAN (transformer)", 2048); cli.add("--transformer-aan-depth", "Depth of filter for AAN (transformer)", 2); cli.add("--transformer-aan-activation", "Activation between filters in AAN: swish or relu (transformer)", "swish"); cli.add("--transformer-aan-nogate", "Omit gate in AAN (transformer)"); cli.add("--transformer-decoder-autoreg", "Type of autoregressive layer in transformer decoder: self-attention, average-attention (transformer)", "self-attention"); cli.add>("--transformer-tied-layers", "List of tied decoder layers (transformer)"); cli.add("--transformer-guided-alignment-layer", "Last or number of layer to use for guided alignment training in transformer", "last"); cli.add("--transformer-preprocess", "Operation before each transformer layer: d = dropout, a = add, n = normalize"); cli.add("--transformer-postprocess-emb", "Operation after transformer embedding layer: d = dropout, a = add, n = normalize", "d"); cli.add("--transformer-postprocess", "Operation after each transformer layer: d = dropout, a = add, n = normalize", "dan"); cli.add("--transformer-postprocess-top", "Final operation after a full transformer stack: d = dropout, a = add, n = normalize. The optional skip connection with 'a' by-passes the entire stack.", ""); cli.add("--transformer-train-position-embeddings", "Train positional embeddings instead of using static sinusoidal embeddings"); cli.add("--transformer-depth-scaling", "Scale down weight initialization in transformer layers by 1 / sqrt(depth)"); cli.add("--bert-mask-symbol", "Masking symbol for BERT masked-LM training", "[MASK]"); cli.add("--bert-sep-symbol", "Sentence separator symbol for BERT next sentence prediction training", "[SEP]"); cli.add("--bert-class-symbol", "Class symbol BERT classifier training", "[CLS]"); cli.add("--bert-masking-fraction", "Fraction of masked out tokens during training", 0.15f); cli.add("--bert-train-type-embeddings", "Train bert type embeddings, set to false to use static sinusoidal embeddings", true); cli.add("--bert-type-vocab-size", "Size of BERT type vocab (sentence A and B)", 2); #ifdef CUDNN cli.add("--char-stride", "Width of max-pooling layer after convolution layer in char-s2s model", 5); cli.add("--char-highway", "Number of highway network layers after max-pooling in char-s2s model", 4); cli.add>("--char-conv-filters-num", "Numbers of convolution filters of corresponding width in char-s2s model", {200, 200, 250, 250, 300, 300, 300, 300}); cli.add>("--char-conv-filters-widths", "Convolution window widths in char-s2s model", {1, 2, 3, 4, 5, 6, 7, 8}); #endif if(mode_ == cli::mode::training) { // TODO: add ->range(0,1); cli.add("--dropout-rnn", "Scaling dropout along rnn layers and time (0 = no dropout)"); cli.add("--dropout-src", "Dropout source words (0 = no dropout)"); cli.add("--dropout-trg", "Dropout target words (0 = no dropout)"); cli.add("--transformer-dropout", "Dropout between transformer layers (0 = no dropout)"); cli.add("--transformer-dropout-attention", "Dropout for transformer attention (0 = no dropout)"); cli.add("--transformer-dropout-ffn", "Dropout for transformer filter (0 = no dropout)"); } cli.switchGroup(previous_group); // clang-format on } void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { auto previous_group = cli.switchGroup("Training options"); // clang-format off cli.add("--cost-type", // @TODO: rename to loss-type "Optimization criterion: ce-mean, ce-mean-words, ce-sum, perplexity", "ce-sum"); cli.add("--multi-loss-type", "How to accumulate multi-objective losses: sum, scaled, mean", "sum"); cli.add("--unlikelihood-loss", "Use word-level weights as indicators for sequence-level unlikelihood training"); cli.add("--overwrite", "Do not create model checkpoints, only overwrite main model file with last checkpoint. " "Reduces disk usage"); cli.add("--no-reload", "Do not load existing model specified in --model arg"); cli.add>("--train-sets,-t", "Paths to training corpora: source target"); cli.add>("--vocabs,-v", "Paths to vocabulary files have to correspond to --train-sets. " "If this parameter is not supplied we look for vocabulary files " "source.{yml,json} and target.{yml,json}. " "If these files do not exist they are created"); #ifdef USE_SENTENCEPIECE cli.add>("--sentencepiece-alphas", "Sampling factors for SentencePiece vocabulary; i-th factor corresponds to i-th vocabulary"); cli.add("--sentencepiece-options", "Pass-through command-line options to SentencePiece trainer"); cli.add("--sentencepiece-max-lines", "Maximum lines to train SentencePiece vocabulary, selected with sampling from all data. " "When set to 0 all lines are going to be used.", 2000000); #endif // scheduling options // @TODO: these should be re-defined as aliases for `--after` but the current frame work matches on value, so not doable. cli.add("--after-epochs,-e", "Finish after this many epochs, 0 is infinity (deprecated, '--after-epochs N' corresponds to '--after Ne')"); // @TODO: replace with alias cli.add("--after-batches", "Finish after this many batch updates, 0 is infinity (deprecated, '--after-batches N' corresponds to '--after Nu')"); // @TODO: replace with alias cli.add("--after,-a", "Finish after this many chosen training units, 0 is infinity (e.g. 100e = 100 epochs, 10Gt = 10 billion target labels, 100Ku = 100,000 updates", "0e"); cli.add("--disp-freq", "Display information every arg updates (append 't' for every arg target labels)", "1000u"); cli.add("--disp-first", "Display information for the first arg updates"); cli.add("--disp-label-counts", "Display label counts when logging loss progress", true); // cli.add("--disp-label-index", // "Display label counts based on i-th input stream (-1 is last)", -1); cli.add("--save-freq", "Save model file every arg updates (append 't' for every arg target labels)", "10000u"); cli.add>("--logical-epoch", "Redefine logical epoch counter as multiple of data epochs (e.g. 1e), updates (e.g. 100Ku) or labels (e.g. 1Gt). " "Second parameter defines width of fractional display, 0 by default.", {"1e", "0"}); addSuboptionsInputLength(cli); addSuboptionsTSV(cli); // data management options cli.add("--shuffle", "How to shuffle input data (data: shuffles data and sorted batches; batches: " "data is read in order into batches, but batches are shuffled; none: no shuffling). " "Use with '--maxi-batch-sort none' in order to achieve exact reading order", "data"); cli.add("--no-shuffle", "Shortcut for backwards compatiblity, equivalent to --shuffle none (deprecated)"); cli.add("--no-restore-corpus", "Skip restoring corpus state after training is restarted"); cli.add("--tempdir,-T", "Directory for temporary (shuffled) files and database", "/tmp"); cli.add("--sqlite", "Use disk-based sqlite3 database for training corpus storage, default" " is temporary with path creates persistent storage") ->implicit_val("temporary"); cli.add("--sqlite-drop", "Drop existing tables in sqlite3 database"); addSuboptionsDevices(cli); addSuboptionsBatching(cli); // optimizer options cli.add("--optimizer,-o", "Optimization algorithm: sgd, adagrad, adam", "adam"); cli.add>("--optimizer-params", "Parameters for optimization algorithm, e.g. betas for Adam. " "Auto-adjusted to --mini-batch-words-ref if given"); cli.add("--optimizer-delay", "SGD update delay (#batches between updates). 1 = no delay. " "Can be fractional, e.g. 0.1 to use only 10% of each batch", 1.f); cli.add("--sync-sgd", "Use synchronous SGD instead of asynchronous for multi-gpu training"); // learning rate options cli.add("--learn-rate,-l", "Learning rate. " "Auto-adjusted to --mini-batch-words-ref if given", 0.0001f); cli.add("--lr-report", "Report learning rate for each update"); cli.add("--lr-decay", "Per-update decay factor for learning rate: lr <- lr * arg (0 to disable)"); cli.add("--lr-decay-strategy", "Strategy for learning rate decaying: epoch, batches, stalled, epoch+batches, epoch+stalled", "epoch+stalled"); cli.add>("--lr-decay-start", "The first number of (epoch, batches, stalled) validations to start learning rate decaying (tuple)", {10, 1}); cli.add("--lr-decay-freq", "Learning rate decaying frequency for batches, requires --lr-decay-strategy to be batches", 50000); cli.add("--lr-decay-reset-optimizer", "Reset running statistics of optimizer whenever learning rate decays"); cli.add("--lr-decay-repeat-warmup", "Repeat learning rate warmup when learning rate is decayed"); cli.add>("--lr-decay-inv-sqrt", "Decrease learning rate at arg / sqrt(no. batches) starting at arg (append 't' or 'e' for sqrt(target labels or epochs)). " "Add second argument to define the starting point (default: same as first value)", {"0"}); cli.add("--lr-warmup", "Increase learning rate linearly for arg first batches (append 't' for arg first target labels)", "0"); cli.add("--lr-warmup-start-rate", "Start value for learning rate warmup"); cli.add("--lr-warmup-cycle", "Apply cyclic warmup"); cli.add("--lr-warmup-at-reload", "Repeat warmup after interrupted training"); cli.add("--label-smoothing", "Epsilon for label smoothing (0 to disable)"); cli.add("--factor-weight", "Weight for loss function for factors (factored vocab only) (1 to disable)", 1.0f); cli.add("--clip-norm", "Clip gradient norm to arg (0 to disable)", 1.f); // @TODO: this is currently wrong with ce-sum and should rather be disabled or fixed by multiplying with labels cli.add("--exponential-smoothing", "Maintain smoothed version of parameters for validation and saving with smoothing factor. 0 to disable. " "Auto-adjusted to --mini-batch-words-ref if given.", 0.f)->implicit_val("1e-4"); cli.add("--guided-alignment", "Path to a file with word alignments. Use guided alignment to guide attention or 'none'. " "If --tsv it specifies the index of a TSV field that contains the alignments (0-based)", "none"); cli.add("--guided-alignment-cost", "Cost type for guided alignment: ce (cross-entropy), mse (mean square error), mult (multiplication)", "ce"); cli.add("--guided-alignment-weight", "Weight for guided alignment cost", 0.1); cli.add("--data-weighting", "Path to a file with sentence or word weights. " "If --tsv it specifies the index of a TSV field that contains the weights (0-based)"); cli.add("--data-weighting-type", "Processing level for data weighting: sentence, word", "sentence"); // embedding options cli.add>("--embedding-vectors", "Paths to files with custom source and target embedding vectors"); cli.add("--embedding-normalization", "Normalize values from custom embedding vectors to [-1, 1]"); cli.add("--embedding-fix-src", "Fix source embeddings. Affects all encoders"); cli.add("--embedding-fix-trg", "Fix target embeddings. Affects all decoders"); // mixed precision training cli.add("--fp16", "Shortcut for mixed precision training with float16 and cost-scaling, " "corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f"); cli.add>("--precision", "Mixed precision training for forward/backward pass and optimizaton. " "Defines types for: forward/backward pass, optimization.", {"float32", "float32"}); cli.add>("--cost-scaling", "Dynamic cost scaling for mixed precision training: " "scaling factor, frequency, multiplier, minimum factor") ->implicit_val("256.f 1000 2.f 256.f"); cli.add("--gradient-norm-average-window", "Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). " "After this many updates about 90% of the mass of the exponential average comes from these updates", 100); cli.add>("--dynamic-gradient-scaling", "Re-scale gradient to have average gradient norm if (log) gradient norm diverges from average by arg1 sigmas. " "If arg2 = \"log\" the statistics are recorded for the log of the gradient norm else use plain norm") ->implicit_val("2.f log"); cli.add("--check-gradient-nan", "Skip parameter update in case of NaNs in gradient"); cli.add("--normalize-gradient", "Normalize gradient by multiplying with no. devices / total labels (not recommended and to be removed in the future)"); cli.add>("--train-embedder-rank", "Override model configuration and train a embedding similarity ranker with the model encoder, " "parameters encode margin and an optional normalization factor") ->implicit_val("0.3f 0.0f"); // model quantization training addSuboptionsQuantization(cli); // add ULR settings addSuboptionsULR(cli); cli.add>("--task", "Use predefined set of options. Possible values: transformer-base, transformer-big, " "transformer-base-prenorm, transformer-big-prenorm"); cli.switchGroup(previous_group); // clang-format on } void ConfigParser::addOptionsValidation(cli::CLIWrapper& cli) { auto previous_group = cli.switchGroup("Validation set options"); // clang-format off cli.add>("--valid-sets", "Paths to validation corpora: source target"); cli.add("--valid-freq", "Validate model every arg updates (append 't' for every arg target labels)", "10000u"); cli.add>("--valid-metrics", "Metric to use during validation: cross-entropy, ce-mean-words, perplexity, valid-script, " "translation, bleu, bleu-detok (deprecated, same as bleu), bleu-segmented, chrf. " "Multiple metrics can be specified", {"cross-entropy"}); cli.add("--valid-reset-stalled", "Reset all stalled validation metrics when the training is restarted"); cli.add("--early-stopping", "Stop if the first validation metric does not improve for arg consecutive validation steps", 10); cli.add("--early-stopping-on", "Decide if early stopping should take into account first, all, or any validation metrics" "Possible values: first, all, any", "first"); // decoding options cli.add("--beam-size,-b", "Beam size used during search with validating translator", 12); cli.add("--normalize,-n", "Divide translation score by pow(translation length, arg)", 0)->implicit_val("1"); cli.add("--max-length-factor", "Maximum target length as source length times factor", 3); cli.add("--word-penalty", "Subtract (arg * translation length) from translation score"); cli.add("--allow-unk", "Allow unknown words to appear in output"); cli.add("--n-best", "Generate n-best list"); cli.add("--word-scores", "Print word-level scores. One score per subword unit, not normalized even if --normalize"); // efficiency options cli.add("--valid-mini-batch", "Size of mini-batch used during validation", 32); cli.add("--valid-max-length", "Maximum length of a sentence in a validating sentence pair. " "Sentences longer than valid-max-length are cropped to valid-max-length", 1000); // options for validation script cli.add("--valid-script-path", "Path to external validation script." " It should print a single score to stdout." " If the option is used with validating translation, the output" " translation file will be passed as a first argument"); cli.add>("--valid-script-args", "Additional args passed to --valid-script-path. These are inserted" " between the script path and the output translation-file path"); cli.add("--valid-translation-output", "(Template for) path to store the translation. " "E.g., validation-output-after-{U}-updates-{T}-tokens.txt. Template " "parameters: {E} for epoch; {B} for No. of batches within epoch; " "{U} for total No. of updates; {T} for total No. of tokens seen."); cli.add("--keep-best", "Keep best model for each validation metric"); cli.add("--valid-log", "Log validation scores to file given by arg"); cli.switchGroup(previous_group); // clang-format on } void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { auto previous_group = cli.switchGroup("Translator options"); // clang-format off cli.add>("--input,-i", "Paths to input file(s), stdin by default", {"stdin"}); cli.add("--output,-o", "Path to output file, stdout by default", "stdout"); cli.add>("--vocabs,-v", "Paths to vocabulary files have to correspond to --input"); // decoding options cli.add("--beam-size,-b", "Beam size used during search with validating translator", 12); cli.add("--normalize,-n", "Divide translation score by pow(translation length, arg)", 0)->implicit_val("1"); cli.add("--max-length-factor", "Maximum target length as source length times factor", 3); cli.add("--word-penalty", "Subtract (arg * translation length) from translation score"); cli.add("--allow-unk", "Allow unknown words to appear in output"); cli.add("--allow-special", "Allow special symbols to appear in output, e.g. for SentencePiece with byte-fallback do not suppress the newline symbol"); cli.add("--n-best", "Generate n-best list"); cli.add("--alignment", "Return word alignment. Possible values: 0.0-1.0, hard, soft") ->implicit_val("1"); cli.add("--word-scores", "Print word-level scores. One score per subword unit, not normalized even if --normalize"); cli.add("--stat-freq", "Display speed information every arg mini-batches. Disabled by default with 0, set to value larger than 0 to activate", "0"); #ifdef USE_SENTENCEPIECE cli.add("--no-spm-decode", "Keep the output segmented into SentencePiece subwords"); #endif addSuboptionsInputLength(cli); addSuboptionsTSV(cli); addSuboptionsDevices(cli); addSuboptionsBatching(cli); cli.add("--fp16", "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); cli.add>("--precision", "Mixed precision for inference, set parameter type in expression graph", {"float32"}); cli.add("--skip-cost", "Ignore model cost during translation, not recommended for beam-size > 1"); cli.add>("--shortlist", "Use softmax shortlist: path first best prune"); cli.add>("--weights", "Scorer weights"); cli.add>("--output-sampling", "Noise output layer with gumbel noise. Implicit default is 'full' for sampling from full distribution. " " Also accepts 'topk num' (e.g. topk 100) for top-100 sampling.") ->implicit_val("full"); cli.add>("--output-approx-knn", "Use approximate knn search in output layer (currently only in transformer)") ->implicit_val("100 1024"); // parameters for on-line quantization cli.add("--optimize", "Optimize the graph on-the-fly", false); cli.add("--gemm-type,-g", "GEMM Type to be used for on-line quantization/packing: float32, packed16, packed8", "float32"); cli.add("--quantize-range", "Range for the on-line quantiziation of weight matrix in multiple of this range and standard deviation, 0.0 means min/max quantization", 0.f); #if 0 // @TODO: Ask Hany if there are any decoding-time options // add ULR settings addSuboptionsULR(cli); #endif cli.switchGroup(previous_group); // clang-format on } void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) { auto previous_group = cli.switchGroup("Scorer options"); // clang-format off cli.add("--no-reload", "Do not load existing model specified in --model arg"); // TODO: move options like vocabs and train-sets to a separate procedure as they are defined twice cli.add>("--train-sets,-t", "Paths to corpora to be scored: source target"); cli.add("--output,-o", "Path to output file, stdout by default", "stdout"); cli.add>("--vocabs,-v", "Paths to vocabulary files have to correspond to --train-sets. " "If this parameter is not supplied we look for vocabulary files source.{yml,json} and target.{yml,json}. " "If these files do not exists they are created"); cli.add("--n-best", "Score n-best list instead of plain text corpus"); cli.add("--n-best-feature", "Feature name to be inserted into n-best list", "Score"); cli.add("--normalize,-n", "Divide translation score by translation length"); cli.add("--summary", "Only print total cost, possible values: cross-entropy (ce-mean), ce-mean-words, ce-sum, perplexity") ->implicit_val("cross-entropy"); cli.add("--alignment", "Return word alignments. Possible values: 0.0-1.0, hard, soft") ->implicit_val("1"), cli.add("--word-scores", "Print word-level scores. One score per subword unit, not normalized even if --normalize"); addSuboptionsInputLength(cli); addSuboptionsTSV(cli); addSuboptionsDevices(cli); addSuboptionsBatching(cli); cli.add("--fp16", "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); cli.add>("--precision", "Mixed precision for inference, set parameter type in expression graph", {"float32"}); // parameters for on-line quantization cli.add("--optimize", "Optimize the graph on-the-fly", false); cli.add("--gemm-type,-g", "GEMM Type to be used for on-line quantization/packing: float32, packed16, packed8", "float32"); cli.add("--quantize-range", "Range for the on-line quantiziation of weight matrix in multiple of this range and standard deviation, 0.0 means min/max quantization", 0.f); cli.switchGroup(previous_group); // clang-format on } void ConfigParser::addOptionsEmbedding(cli::CLIWrapper& cli) { auto previous_group = cli.switchGroup("Scorer options"); // clang-format off cli.add("--no-reload", "Do not load existing model specified in --model arg"); // TODO: move options like vocabs and train-sets to a separate procedure as they are defined twice cli.add>("--train-sets,-t", "Paths to corpora to be scored: source target"); cli.add("--output,-o", "Path to output file, stdout by default", "stdout"); cli.add>("--vocabs,-v", "Paths to vocabulary files have to correspond to --train-sets. " "If this parameter is not supplied we look for vocabulary files source.{yml,json} and target.{yml,json}. " "If these files do not exists they are created"); cli.add("--compute-similarity", "Expect two inputs and compute cosine similarity instead of outputting embedding vector"); cli.add("--binary", "Output vectors as binary floats"); addSuboptionsInputLength(cli); addSuboptionsTSV(cli); addSuboptionsDevices(cli); addSuboptionsBatching(cli); cli.add("--fp16", "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); cli.add>("--precision", "Mixed precision for inference, set parameter type in expression graph. Supported values: float32, float16", {"float32"}); cli.switchGroup(previous_group); // clang-format on } void ConfigParser::addSuboptionsDevices(cli::CLIWrapper& cli) { // clang-format off cli.add>("--devices,-d", "Specifies GPU ID(s) to use for training. Defaults to 0..num-devices-1", {"0"}); cli.add("--num-devices", "Number of GPUs to use for this process. Defaults to length(devices) or 1"); #ifdef USE_NCCL if(mode_ == cli::mode::training) { cli.add("--no-nccl", "Disable inter-GPU communication via NCCL"); cli.add("--sharding", "When using NCCL and MPI for multi-process training use 'global' (default, less memory usage) " "or 'local' (more memory usage but faster) sharding", {"global"}); cli.add("--sync-freq", "When sharding is local sync all shards across processes once every n steps (possible units u=updates, t=target labels, e=epochs)", "200u"); } #endif #ifdef CUDA_FOUND cli.add("--cpu-threads", "Use CPU-based computation with this many independent threads, 0 means GPU-based computation", 0) ->implicit_val("1"); #else cli.add("--cpu-threads", "Use CPU-based computation with this many independent threads, 0 means GPU-based computation", 1); #endif // clang-format on } void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) { int defaultMiniBatch = (mode_ == cli::mode::translation) ? 1 : 64; int defaultMaxiBatch = (mode_ == cli::mode::translation) ? 1 : 100; std::string defaultMaxiBatchSort = (mode_ == cli::mode::translation) ? "none" : "trg"; // clang-format off cli.add("--mini-batch", // set accurate help messages for translation, scoring, or training (mode_ == cli::mode::translation) ? "Size of mini-batch used during batched translation" : (mode_ == cli::mode::scoring) ? "Size of mini-batch used during batched scoring" : "Size of mini-batch used during update", defaultMiniBatch); cli.add("--mini-batch-words", "Set mini-batch size based on words instead of sentences"); if(mode_ == cli::mode::training) { cli.add("--mini-batch-fit", "Determine mini-batch size automatically based on sentence-length to fit reserved memory"); cli.add("--mini-batch-fit-step", "Step size for mini-batch-fit statistics", 10); cli.add("--gradient-checkpointing", "Enable gradient-checkpointing to minimize memory usage"); } cli.add("--maxi-batch", "Number of batches to preload for length-based sorting", defaultMaxiBatch); cli.add("--maxi-batch-sort", "Sorting strategy for maxi-batch: none, src, trg (not available for decoder)", defaultMaxiBatchSort); if(mode_ == cli::mode::training) { cli.add("--shuffle-in-ram", "Keep shuffled corpus in RAM, do not write to temp file"); #if DETERMINISTIC cli.add("--data-threads", "Number of concurrent threads to use during data reading and processing", 1); #else cli.add("--data-threads", "Number of concurrent threads to use during data reading and processing", 8); #endif // @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope. cli.add("--all-caps-every", "When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8"); cli.add("--english-title-case-every", "When forming minibatches, preprocess every Nth line on the fly to title-case. Assumes English (ASCII only)"); cli.add("--mini-batch-words-ref", "If given, the following hyper parameters are adjusted as-if we had this mini-batch size: " "--learn-rate, --optimizer-params, --exponential-smoothing, --mini-batch-warmup"); cli.add("--mini-batch-warmup", "Linear ramp-up of MB size, up to this #updates (append 't' for up to this #target labels). " "Auto-adjusted to --mini-batch-words-ref if given", {"0"}); cli.add("--mini-batch-track-lr", "Dynamically track mini-batch size inverse to actual learning rate (not considering lr-warmup)"); cli.add("--mini-batch-round-up", "Round up batch size to next power of 2 for more efficient training, but this can make batch size less stable. Disable with --mini-batch-round-up=false", true); } else { #if DETERMINISTIC cli.add("--data-threads", "Number of concurrent threads to use during data reading and processing", 1); #else cli.add("--data-threads", "Number of concurrent threads to use during data reading and processing", 8); #endif } // clang-format on } void ConfigParser::addSuboptionsInputLength(cli::CLIWrapper& cli) { size_t defaultMaxLength = (mode_ == cli::mode::training) ? 50 : 1000; // clang-format off cli.add("--max-length", "Maximum length of a sentence in a training sentence pair", defaultMaxLength); cli.add("--max-length-crop", "Crop a sentence to max-length instead of omitting it if longer than max-length"); // clang-format on } void ConfigParser::addSuboptionsTSV(cli::CLIWrapper& cli) { // clang-format off cli.add("--tsv", "Tab-separated input"); cli.add("--tsv-fields", "Number of fields in the TSV input. By default, it is guessed based on the model type"); // clang-format on } void ConfigParser::addSuboptionsULR(cli::CLIWrapper& cli) { // clang-format off // support for universal encoder ULR https://arxiv.org/pdf/1802.05368.pdf cli.add("--ulr", "Enable ULR (Universal Language Representation)"); // reading pre-trained universal embeddings for multi-sources. // Note that source and target here is relative to ULR not the translation langs // queries: EQ in Fig2 : is the unified embeddings projected to one space. cli.add("--ulr-query-vectors", "Path to file with universal sources embeddings from projection into universal space", ""); // keys: EK in Fig2 : is the keys of the target embeddings projected to unified space (i.e. ENU in // multi-lingual case) cli.add("--ulr-keys-vectors", "Path to file with universal sources embeddings of target keys from projection into universal space", ""); cli.add("--ulr-trainable-transformation", "Make Query Transformation Matrix A trainable"); cli.add("--ulr-dim-emb", "ULR monolingual embeddings dimension"); cli.add("--ulr-dropout", "ULR dropout on embeddings attentions. Default is no dropout", 0.0f); cli.add("--ulr-softmax-temperature", "ULR softmax temperature to control randomness of predictions. Deafult is 1.0: no temperature", 1.0f); // clang-format on } void ConfigParser::addSuboptionsQuantization(cli::CLIWrapper& cli) { // clang-format off // model quantization training cli.add("--quantize-bits", "Number of bits to compress model to. Set to 0 to disable", 0); cli.add("--quantize-optimization-steps", "Adjust quantization scaling factor for N steps", 0); cli.add("--quantize-log-based", "Uses log-based quantization"); cli.add("--quantize-biases", "Apply quantization to biases"); // clang-format on } cli::mode ConfigParser::getMode() const { return mode_; } Ptr ConfigParser::parseOptions(int argc, char** argv, bool doValidate) { cmdLine_ = escapeCmdLine(argc,argv); // parse command-line options and fill wrapped YAML config cli_.parse(argc, argv); if(get("authors")) { std::cerr << authors() << std::endl; exit(0); } if(get("cite")) { std::cerr << citation() << std::endl; exit(0); } auto buildInfo = get("build-info"); if(!buildInfo.empty() && buildInfo != "false") { #ifdef BUILD_INFO_AVAILABLE // cmake build options are not available on MSVC based build. if(buildInfo == "all") std::cerr << cmakeBuildOptionsAdvanced() << std::endl; else std::cerr << cmakeBuildOptions() << std::endl; exit(0); #else // BUILD_INFO_AVAILABLE ABORT("build-info is not available on MSVC based build unless compiled via CMake."); #endif // BUILD_INFO_AVAILABLE } // get paths to extra config files auto configPaths = findConfigPaths(); if(!configPaths.empty()) { auto config = loadConfigFiles(configPaths); cli_.updateConfig(config, cli::OptionPriority::ConfigFile, "There are option(s) in a config file that are not expected"); } if(get("interpolate-env-vars")) { cli::processPaths(config_, cli::interpolateEnvVars, PATHS); } // Option shortcuts for input from STDIN for trainer and scorer if(mode_ == cli::mode::training || mode_ == cli::mode::scoring) { auto trainSets = get>("train-sets"); YAML::Node config; // Assume the input will come from STDIN if --tsv is set but no --train-sets are given if(get("tsv") && trainSets.empty()) { config["train-sets"].push_back("stdin"); // Assume the input is in TSV format if --train-sets is set to "stdin" } else if(trainSets.size() == 1 && (trainSets[0] == "stdin" || trainSets[0] == "-")) { config["tsv"] = true; } if(!config.IsNull()) cli_.updateConfig(config, cli::OptionPriority::CommandLine, "A shortcut for STDIN failed."); } if(doValidate) { ConfigValidator(config_).validateOptions(mode_); } // remove extra config files from the config to avoid redundancy config_.remove("config"); // dump config and exit if(!get("dump-config").empty() && get("dump-config") != "false") { auto dumpMode = get("dump-config"); config_.remove("dump-config"); if(dumpMode == "expand") { cli_.parseAliases(); } bool minimal = (dumpMode == "minimal" || dumpMode == "expand"); std::cout << cli_.dumpConfig(minimal) << std::endl; exit(0); } // For TSV input, it is possible to use --input-types to determine fields that contain alignments // or weights. In such case, the position of 'alignment' input type in --input-types determines // the index of a TSV field that contains word alignments, and respectively, the position of // 'weight' in --input-types determines the index of a TSV field that contains weights. // Marian will abort if both the --guided-alignment and 'alignment' in --input-types are specified // (or --data-weighting and 'weight'). // // Note: this may modify the config, so it is safer to do it after --dump-config. if(mode_ == cli::mode::training || get("tsv")) { auto inputTypes = get>("input-types"); if(!inputTypes.empty()) { bool seenAligns = false; bool seenWeight = false; YAML::Node config; for(size_t i = 0; i < inputTypes.size(); ++i) { if(inputTypes[i] == "alignment") { ABORT_IF(seenAligns, "You can specify 'alignment' only once in input-types"); ABORT_IF(has("guided-alignment") && get("guided-alignment") != "none", "You must use either guided-alignment or 'alignment' in input-types"); config["guided-alignment"] = std::to_string(i); seenAligns = true; } if(inputTypes[i] == "weight") { ABORT_IF(seenWeight, "You can specify 'weight' only once in input-types"); ABORT_IF(has("data-weighting") && !get("data-weighting").empty(), "You must use either data-weighting or 'weight' in input-types"); config["data-weighting"] = std::to_string(i); seenWeight = true; } } if(!config.IsNull()) cli_.updateConfig(config, cli::OptionPriority::CommandLine, "Extracting 'alignment' and 'weight' types from input-types failed."); } } #if 0 // @TODO: remove once fully deprecated // Convert --after-batches N to --after Nu and --after-epochs N to --after Ne, different values get concatenated with "," if(mode_ == cli::mode::training && get("after-epochs") > 0) { auto afterValue = get("after-epochs"); LOG(info, "\"--after-epochs {}\" is deprecated, please use \"--after {}e\" instead (\"e\" stands for epoch)", afterValue, afterValue); YAML::Node config; std::string prevAfter = get("after"); std::string converted = std::to_string(afterValue) + "e"; if(prevAfter != "0e") config["after"] = prevAfter + "," + converted; else config["after"] = converted; if(!config.IsNull()) cli_.updateConfig(config, cli::OptionPriority::CommandLine, "Could not update --after with value from --after-epochs"); } if(mode_ == cli::mode::training && get("after-batches") > 0) { auto afterValue = get("after-batches"); LOG(info, "\"--after-batches {}\" is deprecated, please use \"--after {}u\" instead (\"u\" stands for updates)", afterValue, afterValue); YAML::Node config; std::string prevAfter = get("after"); std::string converted = std::to_string(afterValue) + "u"; if(prevAfter != "0e") config["after"] = prevAfter + "," + converted; else config["after"] = converted; if(!config.IsNull()) cli_.updateConfig(config, cli::OptionPriority::CommandLine, "Could not update --after with value from --after-updates"); } #endif cli_.parseAliases(); auto opts = New(); opts->merge(Config(*this).get()); return opts; } std::vector ConfigParser::findConfigPaths() { std::vector paths; bool interpolateEnvVars = get("interpolate-env-vars"); bool loadConfig = !config_["config"].as>().empty(); if(loadConfig) { paths = config_["config"].as>(); for(auto& path : paths) { // (note: this updates the paths array) if(interpolateEnvVars) path = cli::interpolateEnvVars(path); } } else if(mode_ == cli::mode::training) { auto path = config_["model"].as() + ".yml"; if(interpolateEnvVars) path = cli::interpolateEnvVars(path); bool reloadConfig = filesystem::exists(path) && !get("no-reload"); if(reloadConfig) paths = {path}; } return paths; } YAML::Node ConfigParser::loadConfigFiles(const std::vector& paths) { YAML::Node configAll; for(auto& path : paths) { // load single config file io::InputFileStream strm(path); YAML::Node config = YAML::Load(strm); // expand relative paths if requested if(config["relative-paths"] && config["relative-paths"].as()) { // interpolate environment variables if requested in this config file or // via command-line options bool interpolateEnvVars = (config["interpolate-env-vars"] && config["interpolate-env-vars"].as()) || get("interpolate-env-vars"); if(interpolateEnvVars) cli::processPaths(config, cli::interpolateEnvVars, PATHS); // replace relative path w.r.t. the config file cli::makeAbsolutePaths(config, path, PATHS); // remove 'relative-paths' and do not spread it into other config files config.remove("relative-paths"); } // merge with previous config files, later file overrides earlier for(const auto& it : config) { configAll[it.first.as()] = YAML::Clone(it.second); } } return configAll; } const YAML::Node& ConfigParser::getConfig() const { return config_; } } // namespace marian