Program Listing for File types.h

Return to documentation for file (src/data/types.h)

#pragma once

#include "common/definitions.h"

#include <cstdint>
#include <cstdlib>
#include <string>
#include <unordered_map>
#include <vector>
#include <iterator>

namespace marian {

// Type for all vocabulary items, based on IndexType
typedef IndexType WordIndex;    // WordIndex is used for words or tokens arranged in consecutive order
class Word {                    // Word is an abstraction of a unique id, not necessarily consecutive
  WordIndex wordId_;
  explicit Word(std::size_t wordId) : wordId_((WordIndex)wordId) {}
public:
  static Word fromWordIndex(std::size_t wordId) { return Word(wordId); }
  const WordIndex& toWordIndex() const { return wordId_; }
  std::string toString() const { return std::to_string(wordId_); }

  // needed for STL containers
  Word() : wordId_((WordIndex)-1) {}
  bool operator==(const Word& other) const { return wordId_ == other.wordId_; }
  bool operator!=(const Word& other) const { return !(*this == other); }
  bool operator<(const Word& other) const { return wordId_ < other.wordId_; }
  std::size_t hash() const { return std::hash<WordIndex>{}(wordId_); }

  // constants
  static Word NONE; // @TODO: decide whether we need this, in additional Word()
  static Word ZERO; // an invalid word that nevertheless can safely be looked up (and then masked out)
  // EOS and UNK are placed in these positions in Marian-generated vocabs
  static Word DEFAULT_EOS_ID;
  static Word DEFAULT_UNK_ID;
};

// Sequence of vocabulary items
typedef std::vector<Word> Words;

// Helper to map a Word vector to a WordIndex vector
static inline std::vector<WordIndex> toWordIndexVector(const Words& words) {
  std::vector<WordIndex> res;
  std::transform(words.begin(), words.end(), std::back_inserter(res),
                 [](const Word& word) -> WordIndex { return word.toWordIndex(); });
  return res;
}

// names of EOS and UNK symbols
const std::string DEFAULT_EOS_STR = "</s>";
const std::string DEFAULT_UNK_STR = "<unk>";

// alternatively accepted names in Yaml dictionaries for ids 0 and 1, resp.
const std::string NEMATUS_EOS_STR = "eos";
const std::string NEMATUS_UNK_STR = "UNK";

}  // namespace marian

namespace std {
  template<> struct hash<marian::Word> {
    std::size_t  operator()(const marian::Word& s) const noexcept { return s.hash(); }
  };
}