Program Listing for File vocab_base.h

Return to documentation for file (src/data/vocab_base.h)

#pragma once

#include "data/types.h"
#include "common/definitions.h"
#include "common/utils.h"
#include "common/file_stream.h"

namespace marian {

// Abstract interface for vocabularies: maps between surface strings and Word ids,
// and encodes/decodes whole lines. Concrete vocabulary types implement the pure
// virtual members; the non-pure virtuals below provide do-nothing/identity defaults.
class IVocab {
public:
  // Loads the vocabulary stored at `vocabPath`.
  // NOTE(review): `maxSize` presumably caps the number of entries (0 = no limit) and the
  // return value is presumably the number of entries loaded -- confirm against implementations.
  virtual size_t load(const std::string& vocabPath, size_t maxSize = 0) = 0;

  // Creates a vocabulary at `vocabPath` from the files in `trainPaths`.
  // NOTE(review): exact semantics of `maxSize` are implementation-defined -- confirm.
  virtual void create(const std::string& vocabPath,
                      const std::vector<std::string>& trainPaths,
                      size_t maxSize) = 0;

  // return canonical suffix for given type of vocabulary
  virtual const std::string& canonicalExtension() const = 0;
  // All file-name suffixes probed by findAndLoad(), in probing order.
  virtual const std::vector<std::string>& suffixes() const = 0;

  // Tries `path + suffix` for every suffix in suffixes(), in order, and loads the first
  // file that exists. Returns the result of load() for that file, or 0 if none exists.
  size_t findAndLoad(const std::string& path, size_t maxSize) { // @TODO: Only used in one place; just inline it there -> true interface
    for(const auto& suffix : suffixes()) {       // bind by reference: no per-iteration string copy
      const std::string candidate = path + suffix; // build the candidate path once
      if(filesystem::exists(candidate))
        return load(candidate, maxSize);
    }
    return 0;
  }

  // Looks up the Word id for the surface string `word`.
  virtual Word operator[](const std::string& word) const = 0;

  // Converts a text line into a sequence of Word ids.
  // NOTE(review): `addEOS` presumably appends the end-of-sentence id; the effect of
  // `inference` is implementation-specific -- confirm against implementations.
  virtual Words encode(const std::string& line,
                       bool addEOS = true,
                       bool inference = false) const = 0;

  // Converts a sequence of Word ids back into a string.
  // NOTE(review): `ignoreEos` presumably drops the end-of-sentence symbol from the output -- confirm.
  virtual std::string decode(const Words& sentence,
                             bool ignoreEos = true) const = 0;
  // Returns a string rendering of `sentence`.
  // NOTE(review): how this differs from decode() is defined by the implementations -- confirm.
  virtual std::string surfaceForm(const Words& sentence) const = 0;

  // Looks up the string associated with the Word id `id`.
  virtual const std::string& operator[](Word id) const = 0;

  // Number of entries in the vocabulary.
  virtual size_t size() const = 0;
  // Defaults to size(); implementations may override to report a different lemma count.
  virtual size_t lemmaSize() const { return size(); }
  // Name of the vocabulary type.
  virtual std::string type() const = 0;

  // Ids of the end-of-sentence and unknown-word symbols.
  virtual Word getEosId() const = 0;
  virtual Word getUnkId() const = 0;

  // Without specific knowledge of tokenization, these two functions can do nothing,
  // so the defaults return `line` unchanged.
  // NOTE(review): presumably overridden by both SentencePieceVocab and
  // FactoredSegmenterVocab -- confirm against those classes.
  virtual std::string toUpper(const std::string& line) const { return line; }
  virtual std::string toEnglishTitleCase(const std::string& line) const { return line; }

  // Identity mapping for default vocabularies, hence do nothing
  virtual void transcodeToShortlistInPlace(WordIndex* ptr, size_t num) const {
    (void)ptr; (void)num; // intentionally unused in the default implementation
  }

  // Populates vector `special` with special words like "\n" etc.
  // The default implementation adds nothing.
  virtual void addSpecialWords(std::vector<Word>& special) const {
    (void)special; // intentionally unused in the default implementation
  }

  // NOTE(review): presumably sets up a placeholder vocabulary without reading a file -- confirm.
  virtual void createFake() = 0;

  // Returns the word with index rand() % size().
  // Precondition: size() > 0; with an empty vocabulary the modulo divides by zero
  // (undefined behavior). Uses the C library rand(), so the sequence depends on the
  // global srand() state.
  virtual Word randWord() const {
    return Word::fromWordIndex(rand() % size());
  }

  // Virtual so that implementations can be destroyed through an IVocab pointer.
  virtual ~IVocab() = default;
};

// Forward declaration: only Ptr<Options> appears in the factory signature below,
// so the full definition of Options is not needed in this header.
class Options;
// Factory functions returning an IVocab implementation; declared here, defined elsewhere.
// NOTE(review): which concrete class each one returns is inferred from its name only --
// confirm in the corresponding implementation file.
Ptr<IVocab> createDefaultVocab();
Ptr<IVocab> createClassVocab();
// NOTE(review): the meaning of `batchIndex` is not visible here (presumably the index of the
// input stream this vocabulary belongs to) -- confirm against the definition.
Ptr<IVocab> createSentencePieceVocab(const std::string& vocabPath, Ptr<Options>, size_t batchIndex);
Ptr<IVocab> createFactoredVocab(const std::string& vocabPath);

}