
.. _program_listing_file_src_data_corpus.h:

Program Listing for File corpus.h
=================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_data_corpus.h>` (``src/data/corpus.h``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

..
   NOTE(review): this page had been flattened onto one line and every
   angle-bracketed span (the :ref: target, the system #include names and all
   template argument lists) had been stripped. They are reconstructed below;
   please confirm them against the actual src/data/corpus.h.

.. code-block:: cpp

   #pragma once

   #include <fstream>
   #include <iostream>
   #include <random>

   #include "3rd_party/threadpool.h"
   #include "common/definitions.h"
   #include "common/file_stream.h"
   #include "common/options.h"
   #include "data/alignment.h"
   #include "data/batch.h"
   #include "data/corpus_base.h"
   #include "data/dataset.h"
   #include "data/vocab.h"

   namespace marian {
   namespace data {

   /**
    * Corpus reader derived from CorpusBase.
    *
    * Declares the iteration interface (next/shuffle/reset/restore, begin/end),
    * batch construction (toBatch), an optional shuffle-in-RAM mode and
    * per-line pre-processing settings (all-caps / title-case conversion).
    * All non-inline members are defined outside this header.
    */
   class Corpus : public CorpusBase {
   private:
     // Temporary files owned by this corpus.
     // NOTE(review): presumably the shuffled on-disk copies of the inputs; confirm in corpus.cpp.
     std::vector<UPtr<io::TemporaryFile>> tempFiles_;
     // NOTE(review): presumably the (shuffled) order of sentence ids; confirm in corpus.cpp.
     std::vector<size_t> ids_;

     UPtr<ThreadPool> threadPool_;  // thread pool for parallelized data reading

     // for shuffle-in-ram
     bool shuffleInRAM_{false};
     std::vector<std::vector<std::string>> corpusInRAM_;  // [stream][id] full copy of all data files

     // Shuffles the data read from the given paths (defined out of line).
     void shuffleData(const std::vector<std::string>& paths);

     // for pre-processing
     size_t allCapsEvery_{0};    // if set, convert every N-th input sentence (after randomization) to all-caps (source and target)
     size_t titleCaseEvery_{0};  // ditto for title case (source only)

     // Pre-processes one input line of stream `streamId` in place.
     // altered => whether the segmentation was altered in marian
     void preprocessLine(std::string& line, size_t streamId, bool& altered);

   public:
     // @TODO: check if translate can be replaced by an option in options
     Corpus(Ptr<Options> options, bool translate = false, size_t seed = Config::seed);

     // Constructs a corpus from explicitly supplied paths and vocabularies.
     Corpus(std::vector<std::string> paths,
            std::vector<Ptr<Vocab>> vocabs,
            Ptr<Options> options,
            size_t seed = Config::seed);

     // Returns the next sample.
     Sample next() override;

     void shuffle() override;

     void reset() override;

     void restore(Ptr<TrainingState>) override;

     // Iterator bound to this corpus.
     iterator begin() override { return iterator(this); }

     // Default-constructed iterator used as the end sentinel.
     iterator end() override { return iterator(); }

     // Mutable access to vocabs_ (not declared in this class; presumably inherited from CorpusBase).
     std::vector<Ptr<Vocab>>& getVocabs() override { return vocabs_; }

     // Builds a batch from a vector of samples.
     batch_ptr toBatch(const std::vector<Sample>& batchVector) override;
   };

   }  // namespace data
   }  // namespace marian