Program Listing for File src/layers/embedding.cpp

#include "embedding.h"
#include "data/factored_vocab.h"

namespace marian {

Embedding::Embedding(Ptr<ExpressionGraph> graph, Ptr<Options> options)
    : LayerBase(graph, options), inference_(opt<bool>("inference")) {
  std::string name = opt<std::string>("prefix");
  int dimVoc       = opt<int>("dimVocab");
  int dimEmb       = opt<int>("dimEmb");
  int dimFactorEmb = opt<int>("dimFactorEmb");

  bool fixed = opt<bool>("fixed", false);

  // Embedding layer initialization should depend only on embedding size, hence fanIn=false
  auto initFunc = inits::glorotUniform(
      /*fanIn=*/false, /*fanOut=*/true);  // -> embedding vectors have roughly unit length
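  // Worked sanity check (illustrative; assumes the single-fan variant samples
  // entries from U(-a, a) with a = sqrt(3/dimEmb)): each entry then has
  // variance a^2/3 = 1/dimEmb, so a row e of length dimEmb satisfies
  // E[||e||^2] = dimEmb * 1/dimEmb = 1, i.e. roughly unit length.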

  factoredVocab_ = FactoredVocab::tryCreateAndLoad(options_->get<std::string>("vocab", ""));
  if(factoredVocab_) {
    dimVoc = (int)factoredVocab_->factorVocabSize();
    LOG_ONCE(info, "[embedding] Factored embeddings enabled");
    if(opt<std::string>("factorsCombine") == "concat") {
      ABORT_IF(dimFactorEmb == 0,
               "Embedding: If concatenation is chosen to combine the factor embeddings, a factor "
               "embedding size must be specified.");
      int numberOfFactors = (int)factoredVocab_->getTotalFactorCount();
      dimVoc -= numberOfFactors;
      FactorEmbMatrix_
          = graph_->param("factor_" + name, {numberOfFactors, dimFactorEmb}, initFunc, fixed);
      LOG_ONCE(info,
               "[embedding] Combining lemma and factor embeddings via concatenation");
    }
  }
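
  // Illustrative sizes for the "concat" path above: if factorVocabSize() = 32008
  // covers 32000 lemmas plus 8 factors, then numberOfFactors = 8, dimVoc is
  // reduced to 32000 (E_ holds lemma rows only), and the factors get their own
  // {8, dimFactorEmb} parameter FactorEmbMatrix_.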

  if(options_->has("embFile")) {
    std::string file = opt<std::string>("embFile");
    if(!file.empty()) {
      bool norm = opt<bool>("normalization", false);
      initFunc  = inits::fromWord2vec(file, dimVoc, dimEmb, norm);
    }
  }

  E_ = graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed);
}

/*private*/ Expr Embedding::embedWithConcat(const Words& data) const {
  auto graph = E_->graph();
  std::vector<IndexType> lemmaIndices;
  std::vector<float> factorIndices;
  factoredVocab_->lemmaAndFactorsIndexes(data, lemmaIndices, factorIndices);
  auto lemmaEmbs = rows(E_, lemmaIndices);
  int dimFactors = FactorEmbMatrix_->shape()[0];
  auto factEmbs
      = dot(graph->constant({(int)data.size(), dimFactors}, inits::fromVector(factorIndices)),
            FactorEmbMatrix_);

  return concatenate({lemmaEmbs, factEmbs}, -1);
}
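
// Shape walk-through for embedWithConcat (derived from the code above; T is
// the number of flattened tokens, data.size()):
//   lemmaEmbs                  : [T x dimEmb]         (row lookup into E_)
//   factor indicator constant  : [T x numFactors]     (dense multi-hot floats)
//   factEmbs = indicators @ F  : [T x dimFactorEmb]   (F = FactorEmbMatrix_)
//   concatenate({...}, -1)     : [T x (dimEmb + dimFactorEmb)]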

// helper to embed a sequence of words (given as indices) via factored embeddings
Expr Embedding::multiRows(const Words& data, float dropProb) const {
  auto graph        = E_->graph();
  auto factoredData = factoredVocab_->csr_rows(data);
  // multi-hot factor vectors are represented as a sparse CSR matrix
  // [row index = word position index] -> set of factor indices for word at this position
  ABORT_IF(factoredData.shape
               != Shape({(int)factoredData.offsets.size() - 1 /*=rows of CSR*/, E_->shape()[0]}),
           "Shape mismatch in CSR factor data");
  // the CSR matrix is passed in pieces
  auto weights = graph->constant({(int)factoredData.weights.size()},
                                 inits::fromVector(factoredData.weights));
  auto indices = graph->constant(
      {(int)factoredData.indices.size()}, inits::fromVector(factoredData.indices), Type::uint32);
  auto offsets = graph->constant(
      {(int)factoredData.offsets.size()}, inits::fromVector(factoredData.offsets), Type::uint32);
  // apply dropout
  // We apply it to the weights, i.e. factors get dropped out separately, but always as entire
  // vectors.
  if(!inference_)
    weights = dropout(weights, dropProb);
  // perform the product
  return csr_dot(factoredData.shape, weights, indices, offsets, E_);
}
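
// Worked CSR example for multiRows (illustrative indices): suppose word 0
// carries factors {5, 102} and word 1 carries factor {7}. csr_rows then yields
//   weights = [w0, w1, w2]  (one weight per factor, e.g. 1 or 1/refCount;
//                            see the weighting notes in apply() below)
//   indices = [5, 102, 7]
//   offsets = [0, 2, 3]     (row i spans indices[offsets[i] .. offsets[i+1]))
//   shape   = {2, V}
// and csr_dot builds output row i as the weighted sum of the listed rows of E_,
// so dropping an entry of `weights` removes one whole factor vector.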

std::tuple<Expr /*embeddings*/, Expr /*mask*/> Embedding::apply(Ptr<data::SubBatch> subBatch) const
/*override final*/ {
  auto graph   = E_->graph();
  int dimBatch = (int)subBatch->batchSize();
  int dimEmb   = (factoredVocab_ && opt<std::string>("factorsCombine") == "concat")
                   ? E_->shape()[-1] + FactorEmbMatrix_->shape()[-1]
                   : E_->shape()[-1];
  int dimWidth = (int)subBatch->batchWidth();

  // factored embeddings:
  //  - regular:
  //     - y = x @ E    x:[B x 1ofV] ; E:[V x D] ; y:[B x D]
  //  - factored:
  //     - u = x @ M    one-hot to U-dimensional multi-hot (all factors in one concatenated space)
  //        - each row of M contains the set of factors for one word => we want a CSR matrix
  //     - y = (x @ M) @ E   (x:[B x 1ofV] ; M:[V x U]) ; E:[U x D] ; y:[B x D]
  //  - first compute x @ M on the CPU
  //     - (Uvalues, Uindices, Uoffsets) = csr_rows(Mvalues, Mindices, Moffsets, subBatch->data()):
  //        - shape (U, specifically) not actually needed here
  //     - foreach input x[i]
  //        - locate row M[i,*]
  //        - copy through its index values (std::vector<push_back>)
  //     - create a matching ones vector (we can keep growing)
  //     - convert to GPU-side CSR matrix. CSR matrix now has #rows equal to len(x)
  //     - CSR matrix product with E
  //     - csr_dot(Uvalues, Uindices, Uoffsets, E_, transposeU)
  //        - double-check if all dimensions are specified. Probably not for transpose (which would
  //        be like csc_dot()).
  //  - weighting:
  //     - core factors' gradients are sums over all words that use the factors;
  //        - core factors' embeddings move very fast
  //        - words will need to make up for the move; rare words cannot
  //     - so, we multiply each factor with 1/refCount
  //        - core factors get weighed down a lot
  //        - no impact on gradients, as Adam makes up for it; embeddings still move fast just as
  //        before
  //        - but forward pass weighs them down, so that all factors are in a similar numeric range
  //        - if it is required to be in a different range, the embeddings can still learn that, but
  //        more slowly
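  //
  //  - worked example (illustrative sizes V=4, U=3, word w=2):
  //       M = [[1,0,0],         x = onehot(2)      = [0,0,1,0]
  //            [1,1,0],         u = x @ M = M[2,*] = [0,1,1]
  //            [0,1,1],         y = u @ E = E[1,*] + E[2,*]
  //            [0,0,1]]
  //    i.e. a word's embedding is the (weighted) sum of its factors' embedding
  //    vectors; with the 1/refCount weighting above, a factor shared by 10000
  //    words enters each of them with weight 1/10000 in the forward pass.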

  auto batchEmbeddings = apply(subBatch->data(), {dimWidth, dimBatch, dimEmb});

  auto batchMask = graph->constant({dimWidth, dimBatch, 1}, inits::fromVector(subBatch->mask()));
  // give the graph inputs readable names for debugging and ONNX
  batchMask->set_name("data_" + std::to_string(/*batchIndex_=*/0) + "_mask");

  return std::make_tuple(batchEmbeddings, batchMask);
}
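
// Mask layout example (illustrative; follows the {dimWidth, dimBatch, 1} shape
// used above, i.e. time-major storage): two sentences of lengths 3 and 2 give
// dimWidth = 3, dimBatch = 2 and
//   subBatch->mask() = [1,1,  1,1,  1,0]   // flattened [time][batch]
// so position (t=2, b=1) is padding and its embedding is masked downstream.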

Expr Embedding::apply(const Words& words, const Shape& shape) const /*override final*/ {
  if(factoredVocab_) {
    Expr selectedEmbs;
    if(opt<std::string>("factorsCombine") == "concat")
      selectedEmbs = embedWithConcat(words);  // [(B*W) x E]
    else
      selectedEmbs = multiRows(words, options_->get<float>("dropout", 0.0f));  // [(B*W) x E]
    selectedEmbs = reshape(selectedEmbs, shape);                               // [W, B, E]
    // selectedEmbs = dropout(selectedEmbs, options_->get<float>("dropout", 0.0f), {
    // selectedEmbs->shape()[-3], 1, 1 }); // @TODO: replace with factor dropout
    return selectedEmbs;
  } else
    return applyIndices(toWordIndexVector(words), shape);
}

Expr Embedding::applyIndices(const std::vector<WordIndex>& embIdx, const Shape& shape) const
/*override final*/ {
  ABORT_IF(factoredVocab_, "Embedding: applyIndices must not be used with a factored vocabulary");
  auto embIdxExpr = E_->graph()->indices(embIdx);
  embIdxExpr->set_name("data_"
                       + std::to_string(/*batchIndex_=*/0));  // @TODO: how to know the batch index?
  auto selectedEmbs = rows(E_, embIdxExpr);                   // [(B*W) x E]
  selectedEmbs      = reshape(selectedEmbs, shape);           // [W, B, E]
  // @BUGBUG: We should not broadcast along dimBatch=[-2]. Then we can also dropout before reshape()
  // (test that separately)
  if(!inference_)
    selectedEmbs = dropout(
        selectedEmbs, options_->get<float>("dropout", 0.0f), {selectedEmbs->shape()[-3], 1, 1});
  return selectedEmbs;
}
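
// Dropout-shape example for applyIndices (illustrative; marian's dropout
// typically rescales kept values by 1/(1-p)): for selectedEmbs of shape
// [W=3, B=2, E=4], the mask shape {3, 1, 1} samples one value per time step
// and broadcasts it over batch and embedding axes, so a sampled mask of
// [1/(1-p), 0, 1/(1-p)] erases the entire second position for every sentence
// in the batch; that batch-axis broadcast is what the @BUGBUG above flags.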

// standard encoder word embeddings
/*private*/ Ptr<IEmbeddingLayer> EncoderDecoderLayerBase::createEmbeddingLayer() const {
  // clang-format off
  auto options = New<Options>(
      "dimVocab",       opt<std::vector<int>>("dim-vocabs")[batchIndex_],
      "dimEmb",         opt<int>("dim-emb"),
      "dropout",        dropoutEmbeddings_,
      "inference",      inference_,
      "prefix",         (opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all")) ? "Wemb"
                                                                                          : prefix_ + "_Wemb",
      "fixed",          embeddingFix_,
      "dimFactorEmb",   opt<int>("factors-dim-emb"),  // for factored embeddings
      "factorsCombine", opt<std::string>("factors-combine"),  // for factored embeddings
      "vocab",     opt<std::vector<std::string>>("vocabs")[batchIndex_]);  // for factored embeddings
  // clang-format on
  if(options_->hasAndNotEmpty("embedding-vectors")) {
    auto embFiles = opt<std::vector<std::string>>("embedding-vectors");
    options->set(
        "embFile", embFiles[batchIndex_], "normalization", opt<bool>("embedding-normalization"));
  }
  return New<Embedding>(graph_, options);
}
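
// Typical call site (a sketch, not code from this file; assumes `this` derives
// from EncoderDecoderLayerBase and `subBatch` holds the current batch side):
//
//   auto embedding = getEmbeddingLayer(/*ulr=*/false);
//   Expr batchEmbeddings, batchMask;
//   std::tie(batchEmbeddings, batchMask) = embedding->apply(subBatch);
//   // batchEmbeddings: [dimWidth, dimBatch, dimEmb]
//   // batchMask      : [dimWidth, dimBatch, 1]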

// ULR word embeddings
/*private*/ Ptr<IEmbeddingLayer> EncoderDecoderLayerBase::createULREmbeddingLayer() const {
  // clang-format off
  return New<ULREmbedding>(graph_, New<Options>(
      "dimSrcVoc",         opt<std::vector<int>>("dim-vocabs")[0],  // ULR multi-lingual src
      "dimTgtVoc",         opt<std::vector<int>>("dim-vocabs")[1],  // ULR mono tgt
      "dimUlrEmb",         opt<int>("ulr-dim-emb"),
      "dimEmb",            opt<int>("dim-emb"),
      "ulr-dropout",       opt<float>("ulr-dropout"),
      "dropout",           dropoutEmbeddings_,
      "inference",         inference_,
      "ulrTrainTransform", opt<bool>("ulr-trainable-transformation"),
      "ulrQueryFile",      opt<std::string>("ulr-query-vectors"),
      "ulrKeysFile",       opt<std::string>("ulr-keys-vectors")
    ));
  // clang-format on
}

// get embedding layer for this encoder or decoder
// This is lazy mostly because the constructors of the consuming objects are
// not presently guaranteed to have access to their graph.
Ptr<IEmbeddingLayer> EncoderDecoderLayerBase::getEmbeddingLayer(bool ulr) const {
  if(embeddingLayers_.size() <= batchIndex_ || !embeddingLayers_[batchIndex_]) {  // lazy
    if(embeddingLayers_.size() <= batchIndex_)
      embeddingLayers_.resize(batchIndex_ + 1);
    if(ulr)
      embeddingLayers_[batchIndex_] = createULREmbeddingLayer();  // embedding uses ULR
    else
      embeddingLayers_[batchIndex_] = createEmbeddingLayer();
  }
  return embeddingLayers_[batchIndex_];
}

}  // namespace marian