Program Listing for File marian_train.cpp

Return to documentation for file (src/command/marian_train.cpp)

#include <signal.h>
#include "marian.h"

#include "common/signal_handling.h"
#include "training/graph_group_async.h"
#include "training/graph_group_singleton.h"
#include "training/graph_group_sync.h"
#include "training/training.h"

#include "3rd_party/ExceptionWithCallStack.h"

/**
 * Entry point of the training command.
 *
 * Parses the command line in training mode, picks a graph-group implementation based on the
 * "sync-sgd" option and the number of configured devices, and runs training with it.
 *
 * @param argc number of command-line arguments
 * @param argv command-line arguments, forwarded to parseOptions()
 * @return 128 + SIGTERM if the SIGTERM flag is set when training returns, EXIT_SUCCESS otherwise
 */
int main(int argc, char** argv) {
  using namespace marian;
  auto options = parseOptions(argc, argv, cli::mode::training);

  // --sync-sgd always selects SyncGraphGroup
  //
  // If given, then this implementation is used for all combinations of (single, multiple) MPI
  // processes x (single, multiple) GPUs per MPI process.  This variant is presently up-to-date and
  // best supported.
  if(options->get<bool>("sync-sgd")) { // @TODO: make default
    LOG(info, "Using synchronous SGD");
    New<Train<SyncGraphGroup>>(options)->run();
  }
  else {
    // Without --sync-sgd the choice depends on how many devices are configured.
    auto devices = Config::getDevices(options);
    if(devices.size() == 1) {
      // A single device is also trained through SyncGraphGroup; the SingletonGraph call below is
      // intentionally left commented out.
      LOG(info, "[training] Using single-device training");
      New<Train<SyncGraphGroup>>(options)->run();
      // New<Train<SingletonGraph>>(options)->run(); // kept for reference
    } else {
      LOG(info, "Using asynchronous training");
      New<Train<AsyncGraphGroup>>(options)->run();
    }
  }

  // If we exit due to a graceful exit request via SIGTERM, exit with 128 + SIGTERM,
  // as suggested for bash in http://tldp.org/LDP/abs/html/exitcodes.html. This allows parent
  // scripts to determine if training terminated naturally or via SIGTERM.
  // An alternative would be to exit with code 124, which is what the timeout command
  // returns for timeout -s SIGTERM <seconds> ...., because exiting after SIGTERM
  // is not technically a fatal error (which is what the 128+x convention usually
  // stands for).
  //
  // Return instead of calling exit(): exit() does not unwind the stack, so `options` would never
  // be destroyed. Returning from main destroys the locals first and then terminates the process
  // with the same status code.
  return getSignalFlag(SIGTERM) ? 128 + SIGTERM : EXIT_SUCCESS;
}