.. _program_listing_file_src_data_vocab_base.h:

Program Listing for File vocab_base.h
=====================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_data_vocab_base.h>` (``src/data/vocab_base.h``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   #pragma once

   #include "data/types.h"
   #include "common/definitions.h"
   #include "common/utils.h"
   #include "common/file_stream.h"

   namespace marian {

   // Abstract interface shared by all vocabulary implementations. Concrete
   // vocabularies are obtained through the create*Vocab() factory functions
   // declared at the end of this file.
   class IVocab {
   public:
     // Loads the vocabulary stored at `vocabPath`.
     // NOTE(review): the return value is presumably the number of entries loaded
     // and `maxSize == 0` presumably means "no limit" -- confirm against the
     // implementations.
     virtual size_t load(const std::string& vocabPath, size_t maxSize = 0) = 0;

     // Creates a vocabulary at `vocabPath`.
     // NOTE(review): presumably built from the training files in `trainPaths` and
     // capped at `maxSize` entries -- confirm against the implementations.
     virtual void create(const std::string& vocabPath,
                         const std::vector<std::string>& trainPaths,
                         size_t maxSize) = 0;

     // return canonical suffix for given type of vocabulary
     virtual const std::string& canonicalExtension() const = 0;
     // Returns the file-name suffixes that findAndLoad() probes, in order.
     virtual const std::vector<std::string>& suffixes() const = 0;

     // Appends each entry of suffixes() to `path` in turn and loads the first
     // file that exists. Returns the result of load() for that file, or 0 if no
     // candidate file exists.
     // @TODO: Only used in one place; just inline it there -> true interface
     size_t findAndLoad(const std::string& path, size_t maxSize) {
       // Bind by const reference so the suffix string is not copied per iteration.
       for(const auto& suffix : suffixes())
         if(filesystem::exists(path + suffix))
           return load(path + suffix, maxSize);
       return 0;
     }

     // Maps a word string to its Word.
     virtual Word operator[](const std::string& word) const = 0;

     // Converts a line of text into a sequence of Words.
     // NOTE(review): `addEOS` presumably appends the end-of-sentence symbol; the
     // effect of `inference` is implementation-specific -- confirm.
     virtual Words encode(const std::string& line,
                          bool addEOS = true,
                          bool inference = false) const = 0;

     // Converts a sequence of Words back into a string.
     // NOTE(review): `ignoreEos` presumably drops the end-of-sentence symbol -- confirm.
     virtual std::string decode(const Words& sentence,
                                bool ignoreEos = true) const = 0;
     // Returns a string rendering of `sentence`.
     // NOTE(review): how this differs from decode() is not visible here.
     virtual std::string surfaceForm(const Words& sentence) const = 0;

     // Maps a Word back to its string.
     virtual const std::string& operator[](Word id) const = 0;

     virtual size_t size() const = 0;
     // Defaults to size(); may be overridden by implementations.
     virtual size_t lemmaSize() const { return size(); }
     virtual std::string type() const = 0;

     virtual Word getEosId() const = 0;
     virtual Word getUnkId() const = 0;

     // without specific knowledge of tokenization, these two functions can do nothing
     // Both SentencePieceVocab and FactoredSegmenterVocab
     // The defaults below return `line` unchanged.
     virtual std::string toUpper(const std::string& line) const { return line; }
     virtual std::string toEnglishTitleCase(const std::string& line) const { return line; }

     // Identity mapping for default vocabularies, hence do nothing
     virtual void transcodeToShortlistInPlace(WordIndex* ptr, size_t num) const {
       // Parameters are intentionally unused in the default implementation.
       (void)ptr;
       (void)num;
     }

     // Populates vector `special` with special words like "\n" etc.
     // The default implementation adds nothing.
     virtual void addSpecialWords(std::vector<Word>& special) const {
       (void)special;
     }

     virtual void createFake() = 0;

     // Returns the Word for index `rand() % size()`.
     // NOTE(review): undefined behavior if size() == 0 (modulo by zero).
     virtual Word randWord() const {
       return Word::fromWordIndex(rand() % size());
     }

     // Virtual so that implementations can be destroyed through an IVocab pointer.
     virtual ~IVocab() = default;
   };

   class Options;

   // Factory functions for the available vocabulary implementations.
   Ptr<IVocab> createDefaultVocab();
   Ptr<IVocab> createClassVocab();
   Ptr<IVocab> createSentencePieceVocab(const std::string& vocabPath, Ptr<Options>, size_t batchIndex);
   Ptr<IVocab> createFactoredVocab(const std::string& vocabPath);
   }