.. _program_listing_file_src_data_vocab.h:

Program Listing for File vocab.h
================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_data_vocab.h>` (``src/data/vocab.h``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

   #pragma once

   #include "common/definitions.h"
   #include "data/types.h"
   #include "common/options.h"
   #include "common/file_stream.h"

   namespace marian {

   class IVocab;

   // Wrapper around vocabulary types. Can choose underlying
   // vocabulary implementation (vImpl_) based on specified path
   // and suffix.
   // Vocabulary implementations can currently be:
   // * DefaultVocabulary for YAML (*.yml and *.yaml) and TXT (any other non-specific ending)
   // * SentencePiece with suffix *.spm (works, but has to be created outside Marian)
   class Vocab {
   private:
     Ptr<IVocab> vImpl_;     // underlying vocabulary implementation; not set by the constructor
     Ptr<Options> options_;  // options handed to the constructor
     size_t batchIndex_;     // batch index handed to the constructor

   public:
     // Only stores the options and the batch index; the constructor body is empty.
     Vocab(Ptr<Options> options, size_t batchIndex)
         : options_(options), batchIndex_(batchIndex) {}

     // NOTE(review): the following four members are declarations only. The meaning of the
     // size_t return values and of maxSize == 0 is defined in the implementation file --
     // presumably "resulting vocabulary size" and "no size limit"; TODO confirm.
     size_t loadOrCreate(const std::string& vocabPath,
                         const std::vector<std::string>& trainPaths,
                         size_t maxSize = 0);

     size_t load(const std::string& vocabPath, size_t maxSize = 0);

     void create(const std::string& vocabPath,
                 const std::vector<std::string>& trainPaths,
                 size_t maxSize);

     void create(const std::string& vocabPath,
                 const std::string& trainPath,
                 size_t maxSize);

     // string token to token id
     Word operator[](const std::string& word) const;

     // token index to string token
     const std::string& operator[](Word word) const;

     // line of text to list of token ids, can perform tokenization
     Words encode(const std::string& line,
                  bool addEOS = true,
                  bool inference = false) const;

     // convert sequence of token ids to single line, can perform detokenization
     std::string decode(const Words& sentence,
                        bool ignoreEOS = true) const;

     // convert sequence of token ids to surface form (incl. removing spaces, applying factors)
     // for in-process BLEU validation
     std::string surfaceForm(const Words& sentence) const;

     // number of vocabulary items
     size_t size() const;

     // number of lemma items. Same as size() except in factored models
     size_t lemmaSize() const;

     // vocabulary type as a string
     // NOTE(review): the previous comment here was a copy of the one on size(); the exact
     // strings returned are defined by the implementation -- TODO confirm.
     std::string type() const;

     // return EOS symbol id
     Word getEosId() const;

     // return UNK symbol id
     Word getUnkId() const;

     // return a set of Word ids that should be suppressed based on the underlying vocabulary implementation.
     // Arguments most likely provided based on outside options like --allow-unk etc.
     std::vector<Word> suppressedIds(bool suppressUnk = true, bool suppressSpecial = true) const;

     // same as suppressedIds but return numeric word indices into the embedding matrices
     std::vector<WordIndex> suppressedIndices(bool suppressUnk = true, bool suppressSpecial = true) const;

     // for corpus augmentation: convert string to all-caps
     // @TODO: Consider a different implementation where this does not show on the vocab interface,
     //        but instead as additional options passed to vocab instantiation.
     std::string toUpper(const std::string& line) const;

     // for corpus augmentation: convert string to title case
     std::string toEnglishTitleCase(const std::string& line) const;

     // for short-list generation
     void transcodeToShortlistInPlace(WordIndex* ptr, size_t num) const;

     // create fake vocabulary for collecting batch statistics
     void createFake();

     // generate a fake word (using rand())
     Word randWord();

     // give access to base implementation. Returns null if not the requested type.
     // (Implemented as a std::dynamic_pointer_cast of vImpl_ to the requested type.)
     template <class VocabType>  // e.g. FactoredVocab
     Ptr<VocabType> tryAs() const {
       return std::dynamic_pointer_cast<VocabType>(vImpl_);
     }
   };

   }  // namespace marian